From b86b0b150fed840c376145383ef5105116c81b0c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Mar 2020 11:32:14 -0700 Subject: Bluetooth: L2CAP: Fix handling LE modes by L2CAP_OPTIONS L2CAP_OPTIONS shall only be used with BR/EDR modes. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 117ba20ea194..cfb402645c26 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -424,6 +424,20 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, break; } + /* Only BR/EDR modes are supported here */ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + break; + default: + err = -EINVAL; + break; + } + + if (err < 0) + break; + memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; @@ -698,10 +712,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } - chan->mode = opts.mode; - switch (chan->mode) { - case L2CAP_MODE_LE_FLOWCTL: - break; + /* Only BR/EDR modes are supported here */ + switch (opts.mode) { case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; @@ -715,6 +727,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } + if (err < 0) + break; + + chan->mode = opts.mode; + BT_DBG("mode 0x%2.2x", chan->mode); chan->imtu = opts.imtu; -- cgit v1.2.3

From 3ee7b7cd83900bb711efadbf16fa096a615a1566 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Mar 2020 11:32:15 -0700 Subject: Bluetooth: Add BT_MODE socket option This adds the BT_MODE socket option, which can be used to set L2CAP modes, including LE-only modes that could not be set using L2CAP_OPTIONS.
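For illustration, a minimal user-space sketch of the new option (an editorial example, not part of this patch; the BlueZ-style include path is an assumption):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h> /* assumed to provide SOL_BLUETOOTH, BT_MODE */

	/* sk is an L2CAP socket that has been bound but not yet connected:
	 * as the kernel code below shows, BT_MODE takes a single u8, is only
	 * accepted while the socket is in BT_BOUND state, and the whole
	 * option is gated behind the kernel's enable_ecred switch.
	 */
	static int set_le_flowctl(int sk)
	{
		uint8_t mode = BT_MODE_LE_FLOWCTL;

		if (setsockopt(sk, SOL_BLUETOOTH, BT_MODE, &mode, sizeof(mode)) < 0) {
			perror("setsockopt(BT_MODE)");
			return -1;
		}

		return 0;
	}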
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 8 +++ net/bluetooth/l2cap_sock.c | 113 +++++++++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 1576353a2773..3fa7b1e3c5d9 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -139,6 +139,14 @@ struct bt_voice { #define BT_PHY_LE_CODED_TX 0x00002000 #define BT_PHY_LE_CODED_RX 0x00004000 +#define BT_MODE 15 + +#define BT_MODE_BASIC 0x00 +#define BT_MODE_ERTM 0x01 +#define BT_MODE_STREAMING 0x02 +#define BT_MODE_LE_FLOWCTL 0x03 +#define BT_MODE_EXT_FLOWCTL 0x04 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index cfb402645c26..1cea42ee1e92 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -395,6 +395,24 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_l2); } +static int l2cap_get_mode(struct l2cap_chan *chan) +{ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + return BT_MODE_BASIC; + case L2CAP_MODE_ERTM: + return BT_MODE_ERTM; + case L2CAP_MODE_STREAMING: + return BT_MODE_STREAMING; + case L2CAP_MODE_LE_FLOWCTL: + return BT_MODE_LE_FLOWCTL; + case L2CAP_MODE_EXT_FLOWCTL: + return BT_MODE_EXT_FLOWCTL; + } + + return -EINVAL; +} + static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { @@ -522,7 +540,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; u32 phys; - int len, err = 0; + int len, mode, err = 0; BT_DBG("sk %p", sk); @@ -638,6 +656,27 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + mode = l2cap_get_mode(chan); + if (mode < 0) { + err = mode; + break; + } + + if (put_user(mode, (u8 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -780,6 +819,45 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, return err; } +static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) +{ + switch (mode) { + case BT_MODE_BASIC: + if (bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_BASIC; + clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); + break; + case BT_MODE_ERTM: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_ERTM; + break; + case BT_MODE_STREAMING: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_STREAMING; + break; + case BT_MODE_LE_FLOWCTL: + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_LE_FLOWCTL; + break; + case BT_MODE_EXT_FLOWCTL: + /* TODO: Add support for ECRED PDUs to BR/EDR */ + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_EXT_FLOWCTL; + break; + default: + return -EINVAL; + } + + chan->mode = mode; + + return 0; +} + static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -985,6 +1063,39 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + 
case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + BT_DBG("sk->sk_state %u", sk->sk_state); + + if (sk->sk_state != BT_BOUND) { + err = -EINVAL; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u8 __user *) optval)) { + err = -EFAULT; + break; + } + + BT_DBG("opt %u", opt); + + err = l2cap_set_mode(chan, opt); + if (err) + break; + + BT_DBG("mode 0x%2.2x", chan->mode); + + break; + default: err = -ENOPROTOOPT; break; -- cgit v1.2.3

From 92516cd97fd4d8ad5b1421a0d51771044f453a5f Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Fri, 27 Mar 2020 17:34:23 -0700 Subject: Bluetooth: Always request for user confirmation for Just Works To improve security, always give the user-space daemon a chance to accept or reject a Just Works pairing (LE). The daemon may decide to auto-accept based on the user's intent. Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1476a91ce935..d0b695ee49f6 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -855,6 +855,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct smp_chan *smp = chan->data; u32 passkey = 0; int ret = 0; + int err; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -883,9 +884,16 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) smp->method = JUST_WORKS; - /* If Just Works, Continue with Zero TK */ + /* If Just Works, Continue with Zero TK and ask user-space for + * confirmation */ if (smp->method == JUST_WORKS) { - set_bit(SMP_FLAG_TK_VALID, &smp->flags); + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + hcon->type, + hcon->dst_type, + passkey, 1); + if (err) + return SMP_UNSPECIFIED; + set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } -- cgit v1.2.3

From 7fedd3bb6b77f9b6eefb0e4dcd8f79d0d00b86d7 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Mon, 23 Mar 2020 12:45:07 -0700 Subject: Bluetooth: Prioritize SCO traffic When scheduling TX packets, send all SCO/eSCO packets first, then check for pending SCO/eSCO packets after every ACL/LE packet and send them if any are pending. This is done to make sure that we can meet SCO deadlines on slow interfaces like UART. If we were to queue up multiple ACL packets without checking for a SCO packet, we might miss the SCO timing. For example: The time it takes to send a maximum size ACL packet (1024 bytes): t = 10/8 * 1024 bytes * 8 bits/byte * 1 packet / baudrate where 10/8 is the UART overhead due to start/stop bits per byte. Substituting t = 3.75ms (the SCO deadline) gives a baudrate of 2730666. At a baudrate of 3000000, if we didn't check for SCO packets within 1024 bytes, we would miss the 3.75ms timing window.
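As a quick editorial check of the arithmetic above (not part of this patch):

	#include <stdio.h>

	int main(void)
	{
		/* A 1024-byte ACL packet over UART takes 10 bits on the wire
		 * per byte (8 data bits plus start/stop overhead, the 10/8
		 * factor above).
		 */
		unsigned long long bits = 1024ULL * 10;             /* 10240 bits */

		/* Baudrate needed to fit those bits into the 3.75 ms deadline */
		unsigned long long baud = bits * 1000000ULL / 3750;

		printf("%llu bits -> %llu baud\n", bits, baud);     /* 2730666 */
		return 0;
	}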
Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 106 +++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 49 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2e7bc2da8371..5fb9db0b2b7b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4240,6 +4240,54 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) } } +/* Schedule SCO */ +static void hci_sched_sco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, SCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + +static void hci_sched_esco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, ESCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, + &quote))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; @@ -4271,6 +4319,10 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hdev->acl_cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4355,54 +4407,6 @@ static void hci_sched_acl(struct hci_dev *hdev) } } -/* Schedule SCO */ -static void hci_sched_sco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, SCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - -static void hci_sched_esco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, ESCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, - &quote))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; @@ -4437,6 +4441,10 @@ static void hci_sched_le(struct hci_dev *hdev) cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4459,9 +4467,9 @@ static void hci_tx_work(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ - hci_sched_acl(hdev); hci_sched_sco(hdev); hci_sched_esco(hdev); + hci_sched_acl(hdev); hci_sched_le(hdev); } -- cgit v1.2.3

From 1e5479be46a70389e1059818a2e9358858eaa5fc Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Fri, 3 Apr 2020 13:49:05 +0000 Subject: Bluetooth: fixing minor typo in comment This
changes a simple typo in hci_event.c Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0a591be8b0ae..ddf77304aa8e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5269,7 +5269,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_ALWAYS: /* Devices advertising with ADV_IND or ADV_DIRECT_IND * are triggering a connection attempt. This means - * that incoming connectioms from slave device are + * that incoming connections from slave device are * accepted and also outgoing connections to slave * devices are established when found. */ -- cgit v1.2.3

From 1f8330ea1692c9c490b1e566e31d96d8cef99dd8 Mon Sep 17 00:00:00 2001 From: Sathish Narsimman Date: Fri, 3 Apr 2020 21:43:58 +0200 Subject: Bluetooth: add support to notify using SCO air mode Notifying with HCI_NOTIFY_CONN_ADD for an SCO connection is generic, even in the case of mSBC audio. To differentiate the SCO air mode, introduce HCI_NOTIFY_ENABLE_SCO_CVSD and HCI_NOTIFY_ENABLE_SCO_TRANSP. Signed-off-by: Sathish Narsimman Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 3 +++ net/bluetooth/hci_conn.c | 25 +++++++++++++++++++++---- net/bluetooth/hci_event.c | 23 ++++++++++++++++++++++- 3 files changed, 46 insertions(+), 5 deletions(-) (limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5f60e135aeb6..9ff2f7a9e131 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -53,6 +53,9 @@ #define HCI_NOTIFY_CONN_ADD 1 #define HCI_NOTIFY_CONN_DEL 2 #define HCI_NOTIFY_VOICE_SETTING 3 +#define HCI_NOTIFY_ENABLE_SCO_CVSD 4 +#define HCI_NOTIFY_ENABLE_SCO_TRANSP 5 +#define HCI_NOTIFY_DISABLE_SCO 6 /* HCI bus types */ #define HCI_VIRTUAL 0 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e245bc155cc2..07c34c55fc50 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -122,8 +122,18 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_conn_hash_del(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO); + break; + } + } else { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + } hci_conn_del_sysfs(conn); @@ -577,8 +587,15 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + + /* The SCO and eSCO connections will only be notified when their + * setup has been completed. This is different to ACL links which + * can be notified right away.
+ */ + if (conn->type != SCO_LINK && conn->type != ESCO_LINK) { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + } hci_conn_init_sysfs(conn); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ddf77304aa8e..af396cb69602 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2607,8 +2607,16 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (ev->status) { hci_connect_cfm(conn, ev->status); hci_conn_del(conn); - } else if (ev->link_type != ACL_LINK) + } else if (ev->link_type == SCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + } + hci_connect_cfm(conn, ev->status); + } unlock: hci_dev_unlock(hdev); @@ -4307,6 +4315,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, break; } + bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode); + + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); + break; + } + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); -- cgit v1.2.3

From 3d2336042ae3555d4b77995402291c5795882d20 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:00 +0200 Subject: Bluetooth: Move debugfs configuration above the selftests This is just a cosmetic cleanup to move the selftests configuration option to the bottom of the list of options. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/Kconfig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 165148c7c4ce..77703216a2e3 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,14 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_DEBUGFS + bool "Export Bluetooth internals in debugfs" + depends on BT && DEBUG_FS + default y + help + Provide extensive information about internal Bluetooth states + in debugfs. + config BT_SELFTEST bool "Bluetooth self testing support" depends on BT && DEBUG_KERNEL @@ -120,12 +128,4 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. -config BT_DEBUGFS - bool "Export Bluetooth internals in debugfs" - depends on BT && DEBUG_FS - default y - help - Provide extensive information about internal Bluetooth states - in debugfs. - source "drivers/bluetooth/Kconfig" -- cgit v1.2.3

From 145373cb1b1fcdba2059e945d0aa2613af2e84d1 Mon Sep 17 00:00:00 2001 From: Miao-chen Chou Date: Fri, 3 Apr 2020 21:44:01 +0200 Subject: Bluetooth: Add framework for Microsoft vendor extension Microsoft defined a set of HCI vendor extensions. Check the following link for details: https://docs.microsoft.com/en-us/windows-hardware/drivers/bluetooth/microsoft-defined-bluetooth-hci-commands-and-events This provides the basic framework to enable the extension and read its supported features. Drivers still have to declare support for this extension before it can be utilized by the host stack.
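As a sketch of how a driver might declare that support using the hci_set_msft_opcode() helper introduced below (an editorial example, not part of this patch; the opcode value is hypothetical, since each vendor assigns its own):

	/* Somewhere in a driver's setup path; 0xfc1e is a made-up
	 * vendor-specific opcode used purely for illustration.
	 */
	static int example_driver_setup(struct hci_dev *hdev)
	{
		hci_set_msft_opcode(hdev, 0xfc1e);

		return 0;
	}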
Signed-off-by: Miao-chen Chou Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 13 ++++ net/bluetooth/Kconfig | 7 ++ net/bluetooth/Makefile | 1 + net/bluetooth/hci_core.c | 5 ++ net/bluetooth/hci_event.c | 5 ++ net/bluetooth/msft.c | 141 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/msft.h | 18 +++++ 7 files changed, 190 insertions(+) create mode 100644 net/bluetooth/msft.c create mode 100644 net/bluetooth/msft.h (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d4e28773d378..3cb0f82d0c83 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -484,6 +484,11 @@ struct hci_dev { struct led_trigger *power_led; #endif +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + __u16 msft_opcode; + void *msft_data; +#endif + int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); @@ -1116,6 +1121,14 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); __printf(2, 3) void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); __printf(2, 3) void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); + +static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode) +{ +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + hdev->msft_opcode = opcode; +#endif +} + int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 77703216a2e3..9e25c6570170 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,13 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_MSFTEXT + bool "Enable Microsoft extensions" + depends on BT + help + This options enables support for the Microsoft defined HCI + vendor extensions. 
+ config BT_DEBUGFS bool "Export Bluetooth internals in debugfs" depends on BT && DEBUG_FS diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index fda41c0b4781..41dd541a44a5 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -19,5 +19,6 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o +bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5fb9db0b2b7b..ef0ee3a3d9ed 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -44,6 +44,7 @@ #include "hci_debugfs.h" #include "smp.h" #include "leds.h" +#include "msft.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1563,6 +1564,8 @@ setup_failed: hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); + msft_do_open(hdev); + clear_bit(HCI_INIT, &hdev->flags); if (!ret) { @@ -1758,6 +1761,8 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_DOWN); + msft_do_close(hdev); + if (hdev->flush) hdev->flush(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index af396cb69602..2803beaa1c44 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -35,6 +35,7 @@ #include "a2mp.h" #include "amp.h" #include "smp.h" +#include "msft.h" #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -6166,6 +6167,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_num_comp_blocks_evt(hdev, skb); break; + case HCI_EV_VENDOR: + msft_vendor_evt(hdev, skb); + break; + default: BT_DBG("%s event 0x%2.2x", hdev->name, event); break; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c new file mode 100644 index 000000000000..d6c4e6b5ae77 --- /dev/null +++ b/net/bluetooth/msft.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#include +#include + +#include "msft.h" + +#define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 +struct msft_cp_read_supported_features { + __u8 sub_opcode; +} __packed; +struct msft_rp_read_supported_features { + __u8 status; + __u8 sub_opcode; + __le64 features; + __u8 evt_prefix_len; + __u8 evt_prefix[0]; +} __packed; + +struct msft_data { + __u64 features; + __u8 evt_prefix_len; + __u8 *evt_prefix; +}; + +static bool read_supported_features(struct hci_dev *hdev, + struct msft_data *msft) +{ + struct msft_cp_read_supported_features cp; + struct msft_rp_read_supported_features *rp; + struct sk_buff *skb; + + cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", + PTR_ERR(skb)); + return false; + } + + if (skb->len < sizeof(*rp)) { + bt_dev_err(hdev, "MSFT supported features length mismatch"); + goto failed; + } + + rp = (struct msft_rp_read_supported_features *)skb->data; + + if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) + goto failed; + + if (rp->evt_prefix_len > 0) { + msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, + GFP_KERNEL); + if (!msft->evt_prefix) + goto failed; + } + + msft->evt_prefix_len = rp->evt_prefix_len; + msft->features = 
__le64_to_cpu(rp->features); + + kfree_skb(skb); + return true; + +failed: + kfree_skb(skb); + return false; +} + +void msft_do_open(struct hci_dev *hdev) +{ + struct msft_data *msft; + + if (hdev->msft_opcode == HCI_OP_NOP) + return; + + bt_dev_dbg(hdev, "Initialize MSFT extension"); + + msft = kzalloc(sizeof(*msft), GFP_KERNEL); + if (!msft) + return; + + if (!read_supported_features(hdev, msft)) { + kfree(msft); + return; + } + + hdev->msft_data = msft; +} + +void msft_do_close(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + bt_dev_dbg(hdev, "Cleanup of MSFT extension"); + + hdev->msft_data = NULL; + + kfree(msft->evt_prefix); + kfree(msft); +} + +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct msft_data *msft = hdev->msft_data; + u8 event; + + if (!msft) + return; + + /* When the extension has defined an event prefix, check that it + * matches, and otherwise just return. + */ + if (msft->evt_prefix_len > 0) { + if (skb->len < msft->evt_prefix_len) + return; + + if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len)) + return; + + skb_pull(skb, msft->evt_prefix_len); + } + + /* Every event starts at least with an event code and the rest of + * the data is variable and depends on the event code. + */ + if (skb->len < 1) + return; + + event = *skb->data; + skb_pull(skb, 1); + + bt_dev_dbg(hdev, "MSFT vendor event %u", event); +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h new file mode 100644 index 000000000000..5aa9130e1f8a --- /dev/null +++ b/net/bluetooth/msft.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + +void msft_do_open(struct hci_dev *hdev); +void msft_do_close(struct hci_dev *hdev); +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); + +#else + +static inline void msft_do_open(struct hci_dev *hdev) {} +static inline void msft_do_close(struct hci_dev *hdev) {} +static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} + +#endif -- cgit v1.2.3

From a479036041d6a1bcf98f72b16a425e8d45e20ae9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:04 +0200 Subject: Bluetooth: Add support for Read Local Simple Pairing Options With the Read Local Simple Pairing Options command it is possible to retrieve the maximum encryption key size supported by the controller and also whether the controller correctly verifies the ECDH public key during pairing.
Signed-off-by: Marcel Holtmann Reviewed-by: Alain Michaud Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 7 +++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 4 ++++ net/bluetooth/hci_event.c | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+) (limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 9ff2f7a9e131..086a9e9d5d03 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1275,6 +1275,13 @@ struct hci_rp_read_data_block_size { #define HCI_OP_READ_LOCAL_CODECS 0x100b +#define HCI_OP_READ_LOCAL_PAIRING_OPTS 0x100c +struct hci_rp_read_local_pairing_opts { + __u8 status; + __u8 pairing_opts; + __u8 max_key_size; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_ACTIVITY 0x0c1b struct hci_rp_read_page_scan_activity { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 3cb0f82d0c83..2f3275f1d1c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -312,6 +312,8 @@ struct hci_dev { __u16 conn_info_max_age; __u16 auth_payload_timeout; __u8 min_enc_key_size; + __u8 max_enc_key_size; + __u8 pairing_opts; __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ef0ee3a3d9ed..589c4085499c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -827,6 +827,10 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[29] & 0x20) hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); + /* Read local pairing options if the HCI command is supported */ + if (hdev->commands[41] & 0x08) + hci_req_add(req, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL); + /* Get MWS transport configuration if the HCI command is supported */ if (hdev->commands[30] & 0x08) hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 2803beaa1c44..51e6461f0b71 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -747,6 +747,23 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&hdev->setup_addr, &rp->bdaddr); } +static void hci_cc_read_local_pairing_opts(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_pairing_opts *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) { + hdev->pairing_opts = rp->pairing_opts; + hdev->max_enc_key_size = rp->max_key_size; + } +} + static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3343,6 +3360,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_bd_addr(hdev, skb); break; + case HCI_OP_READ_LOCAL_PAIRING_OPTS: + hci_cc_read_local_pairing_opts(hdev, skb); + break; + case HCI_OP_READ_PAGE_SCAN_ACTIVITY: hci_cc_read_page_scan_activity(hdev, skb); break; -- cgit v1.2.3

From bc292258c580a82c9baef0a64f66971e010a40a9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:05 +0200 Subject: Bluetooth: Add support for reading security information To allow userspace to make correct security policy decisions, the kernel needs to export a few details of the supported security features and encryption key size information. This command exports this information and also allows future extensions if needed.
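For illustration, an editorial sketch (not part of this patch) of how user space might walk the EIR-style TLV list returned in the sec field, based on the tags generated below (0x01 flags, 0x02 BR/EDR max key size, 0x03 SMP max key size):

	#include <stdint.h>
	#include <stdio.h>

	/* Each entry is <length><type><data...>, where length covers the type
	 * byte plus the data, as produced by eir_append_data()/eir_append_le16().
	 */
	static void parse_security_info(const uint8_t *sec, uint16_t sec_len)
	{
		uint16_t i = 0;

		while (i + 1 < sec_len) {
			uint8_t len = sec[i];
			uint8_t type = sec[i + 1];
			const uint8_t *data = &sec[i + 2];

			if (!len || i + 1 + len > sec_len)
				break;

			if (type == 0x01 && len == 2)
				printf("flags: 0x%02x\n", data[0]);
			else if ((type == 0x02 || type == 0x03) && len == 3)
				printf("key size (tag %u): %u\n", type,
				       (unsigned int)(data[0] | (data[1] << 8)));

			i += 1 + len;
		}
	}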
Signed-off-by: Marcel Holtmann Reviewed-by: Alain Michaud Signed-off-by: Johan Hedberg --- include/net/bluetooth/mgmt.h | 7 ++++++ net/bluetooth/mgmt.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index f41cd87550dc..65dd6fd1fff3 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -674,6 +674,13 @@ struct mgmt_cp_set_blocked_keys { #define MGMT_OP_SET_WIDEBAND_SPEECH 0x0047 +#define MGMT_OP_READ_SECURITY_INFO 0x0048 +#define MGMT_READ_SECURITY_INFO_SIZE 0 +struct mgmt_rp_read_security_info { + __le16 sec_len; + __u8 sec[0]; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6552003a170e..7b9eac339c87 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -108,6 +108,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_APPEARANCE, MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_OP_READ_SECURITY_INFO, }; static const u16 mgmt_events[] = { @@ -155,6 +156,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, + MGMT_OP_READ_SECURITY_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -3659,6 +3661,55 @@ unlock: return err; } +static int read_security_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[16]; + struct mgmt_rp_read_security_info *rp = (void *)buf; + u16 sec_len = 0; + u8 flags = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + + hci_dev_lock(hdev); + + /* When the Read Simple Pairing Options command is supported, then + * the remote public key validation is supported. + */ + if (hdev->commands[41] & 0x08) + flags |= 0x01; /* Remote public key validation (BR/EDR) */ + + flags |= 0x02; /* Remote public key validation (LE) */ + + /* When the Read Encryption Key Size command is supported, then the + * encryption key size is enforced. + */ + if (hdev->commands[20] & 0x10) + flags |= 0x04; /* Encryption key size enforcement (BR/EDR) */ + + flags |= 0x08; /* Encryption key size enforcement (LE) */ + + sec_len = eir_append_data(rp->sec, sec_len, 0x01, &flags, 1); + + /* When the Read Simple Pairing Options command is supported, then + * also max encryption key size information is provided. + */ + if (hdev->commands[41] & 0x08) + sec_len = eir_append_le16(rp->sec, sec_len, 0x02, + hdev->max_enc_key_size); + + sec_len = eir_append_le16(rp->sec, sec_len, 0x03, SMP_MAX_ENC_KEY_SIZE); + + rp->sec_len = cpu_to_le16(sec_len); + + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_SECURITY_INFO, 0, + rp, sizeof(*rp) + sec_len); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -7099,6 +7150,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { set_wideband_speech, MGMT_SETTING_SIZE }, + { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From 3679fe7d43c65e07f00afb216987f33e152ceb6f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 3 Apr 2020 21:44:06 +0200 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added new commands. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7b9eac339c87..f8c0a4fc8090 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 16 +#define MGMT_REVISION 17 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3 From c2aa30db744d9cbdde127d4ed8aeea18273834c6 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 7 Apr 2020 12:26:27 +0800 Subject: Bluetooth: debugfs option to unset MITM flag The BT qualification test SM/MAS/PKE/BV-01-C needs us to turn off the MITM flag when pairing, and at the same time also set the io capability to something other than no input no output. Currently the MITM flag is only unset when the io capability is set to no input no output, therefore the test cannot be executed. This patch introduces a debugfs option to force MITM flag to be turned off. Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_debugfs.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ net/bluetooth/smp.c | 15 ++++++++++----- 3 files changed, 57 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 79de2a659dd6..f4e8e2a0b7c1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -298,6 +298,7 @@ enum { HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, HCI_CMD_PENDING, + HCI_FORCE_NO_MITM, __HCI_NUM_FLAGS, }; diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 6b1314c738b8..5e8af2658e44 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -1075,6 +1075,50 @@ DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, auth_payload_timeout_get, auth_payload_timeout_set, "%llu\n"); +static ssize_t force_no_mitm_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM) ? 
'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_no_mitm_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + bool enable; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM)) + return -EALREADY; + + hci_dev_change_flag(hdev, HCI_FORCE_NO_MITM); + + return count; +} + +static const struct file_operations force_no_mitm_fops = { + .open = simple_open, + .read = force_no_mitm_read, + .write = force_no_mitm_write, + .llseek = default_llseek, +}; + DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, HCI_QUIRK_STRICT_DUPLICATE_FILTER); DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, @@ -1134,6 +1178,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &max_key_size_fops); debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, &auth_payload_timeout_fops); + debugfs_create_file("force_no_mitm", 0644, hdev->debugfs, hdev, + &force_no_mitm_fops); debugfs_create_file("quirk_strict_duplicate_filter", 0644, hdev->debugfs, hdev, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d0b695ee49f6..a85e3e49cd0d 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2393,12 +2393,17 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq |= SMP_AUTH_CT2; } - /* Require MITM if IO Capability allows or the security level - * requires it. + /* Don't attempt to set MITM if setting is overridden by debugfs + * Needed to pass certification test SM/MAS/PKE/BV-01-C */ - if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || - hcon->pending_sec_level > BT_SECURITY_MEDIUM) - authreq |= SMP_AUTH_MITM; + if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { + /* Require MITM if IO Capability allows or the security level + * requires it. + */ + if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || + hcon->pending_sec_level > BT_SECURITY_MEDIUM) + authreq |= SMP_AUTH_MITM; + } if (hcon->role == HCI_ROLE_MASTER) { struct smp_cmd_pairing cp; -- cgit v1.2.3 From d1d900f822b6b2874de9c1ef8094fc8df56a2f9f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 6 Apr 2020 11:54:38 -0700 Subject: Bluetooth: Simplify / fix return values from tk_request Some static checker run by 0day reports a variableScope warning. net/bluetooth/smp.c:870:6: warning: The scope of the variable 'err' can be reduced. [variableScope] There is no need for two separate variables holding return values. Stick with the existing variable. While at it, don't pre-initialize 'ret' because it is set in each code path. tk_request() is supposed to return a negative error code on errors, not a bluetooth return code. The calling code converts the return value to SMP_UNSPECIFIED if needed. 
Fixes: 92516cd97fd4 ("Bluetooth: Always request for user confirmation for Just Works") Cc: Sonny Sasaka Signed-off-by: Guenter Roeck Reviewed-by: Sonny Sasaka Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a85e3e49cd0d..daf198fb2b31 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -854,8 +854,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; u32 passkey = 0; - int ret = 0; - int err; + int ret; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -887,12 +886,12 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, /* If Just Works, Continue with Zero TK and ask user-space for * confirmation */ if (smp->method == JUST_WORKS) { - err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); - if (err) - return SMP_UNSPECIFIED; + if (ret) + return ret; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } -- cgit v1.2.3 From ffee202a78c2980688bc5d2f7d56480e69a5e0c9 Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Mon, 6 Apr 2020 11:04:02 -0700 Subject: Bluetooth: Always request for user confirmation for Just Works (LE SC) To improve security, always give the user-space daemon a chance to accept or reject a Just Works pairing (LE). The daemon may decide to auto-accept based on the user's intent. This patch is similar to the previous patch but applies for LE Secure Connections (SC). Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index daf198fb2b31..df22cbf94693 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2201,7 +2201,7 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; - if (smp->method == JUST_WORKS || smp->method == REQ_OOB) { + if (smp->method == REQ_OOB) { if (hcon->out) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); @@ -2216,6 +2216,9 @@ mackey_and_ltk: confirm_hint = 0; confirm: + if (smp->method == JUST_WORKS) + confirm_hint = 1; + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) -- cgit v1.2.3 From 943d5d92c5e87aa8293aae6de2b3ee977aa7d3cf Mon Sep 17 00:00:00 2001 From: Daniels Umanovskis Date: Thu, 9 Apr 2020 13:18:29 +0200 Subject: Bluetooth: log advertisement packet length if it gets corrected The error could indicate a problem with the Bluetooth device. It is easier to investigate if the packet's actual length gets logged, not just the fact that a discrepancy occurred. 
Signed-off-by: Daniels Umanovskis Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net')

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 51e6461f0b71..966fc543c01d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5396,7 +5396,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Adjust for actual length */ if (len != real_len) { - bt_dev_err_ratelimited(hdev, "advertising data len corrected"); + bt_dev_err_ratelimited(hdev, "advertising data len corrected %u -> %u", + len, real_len); len = real_len; } -- cgit v1.2.3

From 849c9c35e80d73c215c65b6023658b371bdeb5ed Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:48 +0200 Subject: Bluetooth: Use extra variable to make code more readable When starting active scanning for discovery, the whitelist does not need to be used, so the filter_policy is 0x00. To make the code more readable, use a named variable instead of just passing 0 as the parameter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_request.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 649e1e5ed446..9ea40106ef17 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -2723,6 +2723,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) uint16_t interval = opt; struct hci_dev *hdev = req->hdev; u8 own_addr_type; + /* White list is not used for discovery */ + u8 filter_policy = 0x00; int err; BT_DBG("%s", hdev->name); @@ -2744,7 +2746,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) own_addr_type = ADDR_LE_DEV_PUBLIC; hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, - own_addr_type, 0); + own_addr_type, filter_policy); return 0; } -- cgit v1.2.3

From ff3b8df2bd758d97aa3dd7c021864be05fec9bd5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 9 Apr 2020 08:05:49 +0200 Subject: Bluetooth: Enable LE Enhanced Connection Complete event. In case LL Privacy is supported by the controller, it is also a good idea to use the LE Enhanced Connection Complete event for getting all information about the new connection and its addresses. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_core.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ff42d05b3e72..1da8cec8e210 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -460,6 +460,7 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_LL_PRIVACY 0x40 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 589c4085499c..0d726d59a492 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -638,6 +638,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ + /* If the controller supports LL Privacy feature, enable + * the corresponding event.
+ */ + if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + events[1] |= 0x02; /* LE Enhanced Connection + * Complete + */ + /* If the controller supports Extended Scanner Filter * Policies, enable the correspondig event. */ -- cgit v1.2.3

From 7edc9079540b65026f3d3386b3642d1820d5fed5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 15 Apr 2020 17:35:16 +0200 Subject: Bluetooth: Enhanced Connection Complete event belongs to LL Privacy The Enhanced Connection Complete event is used in conjunction with LL Privacy and not Extended Advertising. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net')

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0d726d59a492..51d399273276 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -719,14 +719,6 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ - /* If the controller supports the LE Extended Create Connection - * command, enable the corresponding event. - */ - if (use_ext_conn(hdev)) - events[1] |= 0x02; /* LE Enhanced Connection - * Complete - */ - /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ -- cgit v1.2.3

From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as an IF_LINK_MODE_ and a __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if.h | 1 + net/core/dev.c | 5 +++++ net/core/link_watch.c | 12 ++++++++++-- net/core/rtnetlink.c | 9 ++++++++- 5 files changed, 65 insertions(+), 3 deletions(-) (limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..0750b54b3765 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -288,6 +288,7 @@ enum netdev_state_t { __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, + __LINK_STATE_TESTING, }; @@ -3907,6 +3908,46 @@ static inline bool netif_dormant(const struct net_device *dev) } +/** + * netif_testing_on - mark device as under test. + * @dev: network device + * + * Mark device as under test (as per RFC2863). + * + * The testing state indicates that some test(s) must be performed on + * the interface. After completion, of the test, the interface state + * will change to up, dormant, or down, as appropriate. + */ +static inline void netif_testing_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing_off - set device as not under test. + * @dev: network device + * + * Device is not in testing state.
+ */ +static inline void netif_testing_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing - test if device is under test + * @dev: network device + * + * Check if device is under test + */ +static inline bool netif_testing(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_TESTING, &dev->state); +} + + /** * netif_oper_up - test if device is operational + * @dev: network device diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index be714cd8c826..797ba2c1562a 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -178,6 +178,7 @@ enum { enum { IF_LINK_MODE_DEFAULT, IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ }; /* diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..fb61522b1ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9136,6 +9136,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..d6f4f4a9e8ba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) -- cgit v1.2.3

From db30a57779b18b7cef092c21887ed2d23ad2bd35 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:51 +0200 Subject: net: Add testing sysfs attribute Similar to speed, duplex and dormant, report the testing status in sysfs. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- Documentation/ABI/testing/sysfs-class-net | 13 +++++++++++++ net/core/net-sysfs.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 664a8f6a634f..3b404577f380 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -124,6 +124,19 @@ Description: authentication is performed (e.g: 802.1x). 'link_mode' attribute will also reflect the dormant state. +What: /sys/class/net//testing +Date: April 2002 +KernelVersion: 5.8 +Contact: netdev@vger.kernel.org +Description: + Indicates whether the interface is under test. Possible + values are: + 0: interface is not being tested + 1: interface is being tested + + When an interface is under test, it cannot be expected + to pass packets as normal. + What: /sys/clas/net//duplex Date: October 2009 KernelVersion: 2.6.33 diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4773ad6ec111..0d9e46de205e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev, } static DEVICE_ATTR_RO(duplex); +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,7 +272,7 @@ static const char *const operstates[] = { "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; @@ -524,6 +536,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, + &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, -- cgit v1.2.3 From 77e9b2ab451d32c81468ef679c377b2f831cd720 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:52 +0200 Subject: net: ethtool: self_test: Mark interface in testing operative status When an interface is executing a self test, put the interface into operative status testing. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 89d0b1827aaf..593fa665f820 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1746,7 +1746,9 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) if (!data) return -ENOMEM; + netif_testing_on(dev); ops->self_test(dev, &test, data); + netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) -- cgit v1.2.3 From b6246f4d8d0778fd045b84dbd7fc5aadd8f3136e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Apr 2020 22:51:49 +0100 Subject: net: ipv4: remove redundant assignment to variable rc The variable rc is being assigned with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cf58e29cf746..c618e242490f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1914,7 +1914,7 @@ static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; - int rc = -EINVAL; + int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); -- cgit v1.2.3 From 2a7e978625e887cbb80cb2188cf5f955e4223fa1 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 13 Apr 2020 20:40:20 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2a234d0ad445..61d8dbe8c954 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.1" +#define BATADV_SOURCE_VERSION "2020.2" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From c08dd06b3d25e8450a2e187ee6b7412d07c460af Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 26 Mar 2020 18:37:07 +0100 Subject: batman-adv: Fix spelling error in term buffer checkpatch warns about a typo in the word bufFer which was introduced in commit 2191c1bcbc64 ("batman-adv: kernel doc for types.h"). Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4a17a66cc572..d152b8e81f61 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1086,7 +1086,7 @@ struct batadv_priv_bla { * struct batadv_priv_debug_log - debug logging data */ struct batadv_priv_debug_log { - /** @log_buff: buffer holding the logs (ring bufer) */ + /** @log_buff: buffer holding the logs (ring buffer) */ char log_buff[BATADV_LOG_BUF_LEN]; /** @log_start: index of next character to read */ -- cgit v1.2.3 From 9204a4f876b229505f4ab947124d1160b80bbb4c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 13 Apr 2020 20:26:07 +0200 Subject: batman-adv: trace: Drop unneeded types.h include The commit 04ae87a52074 ("ftrace: Rework event_create_dir()") restructured various macros in the ftrace framework. These changes also had the nice side effect that the linux/types.h include is no longer necessary to define some of the types used by these macros. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index f631b1e01b89..a87547570b4e 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -15,7 +15,6 @@ #include #include #include -#include #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv -- cgit v1.2.3 From 26893e7e928e1790852b072edb9bff3c1309e111 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 13 Apr 2020 21:23:29 +0200 Subject: batman-adv: Utilize prandom_u32_max for random [0, max) values The kernel provides a function to create random values from 0 - (max-1) since commit f337db64af05 ("random32: add prandom_u32_max and convert open coded users"). Simply use this function to replace code sections which use prandom_u32 and a handcrafted method to map it to the correct range. 
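The transformation is mechanical; a small editorial sketch (not part of this patch) of the two equivalent forms:

	#include <linux/random.h> /* declares prandom_u32() and prandom_u32_max() */

	/* Both return a pseudo-random value in [0, max); the helper avoids
	 * the open-coded modulo by using a 64-bit multiply and shift
	 * internally.
	 */
	static u32 jitter_old(u32 max)
	{
		return prandom_u32() % max;	/* handcrafted mapping */
	}

	static u32 jitter_new(u32 max)
	{
		return prandom_u32_max(max);	/* helper from commit f337db64af05 */
	}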
Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 4 ++-- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_ogm.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a7c8dd7ae513..e87f19c82e8d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -280,7 +280,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -288,7 +288,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(prandom_u32_max(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1e3172db7492..353e49c40e7f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -49,7 +49,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs; msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, msecs_to_jiffies(msecs)); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 969466218999..0959d32be65c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -88,7 +88,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ - msecs += prandom_u32() % (msecs / 5) - (msecs / 10); + msecs += prandom_u32_max(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } @@ -107,7 +107,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } -- cgit v1.2.3 From 1c79031f8a75c132bbd42e3ef20267af97b67466 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 17:18:47 +0300 Subject: drivers: Remove inclusion of vermagic header Get rid of linux/vermagic.h includes, so that MODULE_ARCH_VERMAGIC from the arch header arch/x86/include/asm/module.h won't be redefined. 
In file included from ./include/linux/module.h:30, from drivers/net/ethernet/3com/3c515.c:56: ./arch/x86/include/asm/module.h:73: warning: "MODULE_ARCH_VERMAGIC" redefined 73 | # define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY | In file included from drivers/net/ethernet/3com/3c515.c:25: ./include/linux/vermagic.h:28: note: this is the location of the previous definition 28 | #define MODULE_ARCH_VERMAGIC "" | Fixes: 6bba2e89a88c ("net/3com: Delete driver and module versions from 3com drivers") Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Acked-by: Shannon Nelson # ionic Acked-by: Sebastian Reichel # power Signed-off-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/bonding/bonding_priv.h | 2 +- drivers/net/ethernet/3com/3c509.c | 1 - drivers/net/ethernet/3com/3c515.c | 1 - drivers/net/ethernet/adaptec/starfire.c | 1 - drivers/net/ethernet/pensando/ionic/ionic_main.c | 2 +- drivers/power/supply/test_power.c | 2 +- net/ethtool/ioctl.c | 3 +-- 7 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h index 45b77bc8c7b3..48cdf3a49a7d 100644 --- a/drivers/net/bonding/bonding_priv.h +++ b/drivers/net/bonding/bonding_priv.h @@ -14,7 +14,7 @@ #ifndef _BONDING_PRIV_H #define _BONDING_PRIV_H -#include +#include #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index b762176a1406..139d0120f511 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -85,7 +85,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index 90312fcd6319..47b4215bb93b 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -22,7 +22,6 @@ */ -#include #define DRV_NAME "3c515" #define CORKSCREW 1 diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 2db42211329f..a64191fc2af9 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -45,7 +45,6 @@ #include /* Processor type for cache alignment. 
*/ #include #include -#include /* * The current frame processor firmware fails to checksum a fragment diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 588c62e9add7..3ed150512091 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include "ionic.h" #include "ionic_bus.h" diff --git a/drivers/power/supply/test_power.c b/drivers/power/supply/test_power.c index 65c23ef6408d..b3c05ff05783 100644 --- a/drivers/power/supply/test_power.c +++ b/drivers/power/supply/test_power.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include enum test_power_id { TEST_AC, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 593fa665f820..226d5ecdd567 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,7 @@ #include #include #include - +#include #include "common.h" /* -- cgit v1.2.3 From beb97d3a3192c00575580af9073921c6283cf93d Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 21 Apr 2020 07:55:43 +0800 Subject: net/sched: act_ct: update nf_conn_acct for act_ct SW offload in flowtable When act_ct SW offload is used with the flowtable, the counter of the conntrack entry is never updated. So update the nf_conn_acct counter in the act_ct flowtable software offload path. Signed-off-by: wenxu Reviewed-by: Roi Dayan Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1a766393be62..9adff83b523b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -536,6 +537,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); + nf_ct_acct_update(ct, dir, skb->len); return true; } -- cgit v1.2.3 From 540bde5c2c3da005b87b3edb394d6ca4f890777d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 21 Apr 2020 11:09:12 +0800 Subject: ila: remove unused macro 'ILA_HASH_TABLE_SIZE' net/ipv6/ila/ila_xlat.c:604:0: warning: macro "ILA_HASH_TABLE_SIZE" is not used [-Wunused-macros] Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/ila/ila_xlat.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 5fc1f4e0c0cf..a1ac0e3d8c60 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -601,8 +601,6 @@ out_ret: return ret; } -#define ILA_HASH_TABLE_SIZE 1024 - int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); -- cgit v1.2.3 From dfddb54043f0a377f642bd0e6a28aa40769e2e65 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 21 Apr 2020 13:10:54 +0530 Subject: net: qrtr: Add tracepoint support Add tracepoint support for QRTR with NS as the first candidate. Later on this can be extended to core QRTR and transport drivers. The trace_printk() used in NS has been replaced by tracepoints. Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S.
Miller --- include/trace/events/qrtr.h | 115 ++++++++++++++++++++++++++++++++++++++++++++ net/qrtr/ns.c | 20 ++++---- 2 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 include/trace/events/qrtr.h (limited to 'net') diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h new file mode 100644 index 000000000000..b1de14c3bb93 --- /dev/null +++ b/include/trace/events/qrtr.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM qrtr + +#if !defined(_TRACE_QRTR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_QRTR_H + +#include +#include + +TRACE_EVENT(qrtr_ns_service_announce_new, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("advertising new server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_service_announce_del, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("advertising removal of server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_server_add, + + TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port), + + TP_ARGS(service, instance, node, port), + + TP_STRUCT__entry( + __field(__le32, service) + __field(__le32, instance) + __field(__le32, node) + __field(__le32, port) + ), + + TP_fast_assign( + __entry->service = service; + __entry->instance = instance; + __entry->node = node; + __entry->port = port; + ), + + TP_printk("add server [%d:%x]@[%d:%d]", + __entry->service, __entry->instance, __entry->node, + __entry->port + ) +); + +TRACE_EVENT(qrtr_ns_message, + + TP_PROTO(const char * const ctrl_pkt_str, __u32 sq_node, __u32 sq_port), + + TP_ARGS(ctrl_pkt_str, sq_node, sq_port), + + TP_STRUCT__entry( + __string(ctrl_pkt_str, ctrl_pkt_str) + __field(__u32, sq_node) + __field(__u32, sq_port) + ), + + TP_fast_assign( + __assign_str(ctrl_pkt_str, ctrl_pkt_str); + __entry->sq_node = sq_node; + __entry->sq_port = sq_port; + ), + + TP_printk("%s from %d:%d", + __get_str(ctrl_pkt_str), __entry->sq_node, __entry->sq_port + ) +); + +#endif /* _TRACE_QRTR_H */ + +/* This part must be outside protection */ +#include diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index e7d0fe3f4330..3ca196fc7f9b 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -12,6 +12,9 @@ #include "qrtr.h" +#define CREATE_TRACE_POINTS +#include + static RADIX_TREE(nodes, GFP_KERNEL); static struct { @@ -105,8 +108,8 @@ static int service_announce_new(struct sockaddr_qrtr *dest, struct msghdr msg = { }; struct kvec iv; - trace_printk("advertising new server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_new(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -132,8 +135,8 @@ static int 
service_announce_del(struct sockaddr_qrtr *dest, struct kvec iv; int ret; - trace_printk("advertising removal of server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_del(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -244,8 +247,8 @@ static struct qrtr_server *server_add(unsigned int service, radix_tree_insert(&node->servers, port, srv); - trace_printk("add server [%d:%x]@[%d:%d]\n", srv->service, - srv->instance, srv->node, srv->port); + trace_qrtr_ns_server_add(srv->service, srv->instance, + srv->node, srv->port); return srv; @@ -633,9 +636,8 @@ static void qrtr_ns_worker(struct work_struct *work) cmd = le32_to_cpu(pkt->cmd); if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) && qrtr_ctrl_pkt_strings[cmd]) - trace_printk("%s from %d:%d\n", - qrtr_ctrl_pkt_strings[cmd], sq.sq_node, - sq.sq_port); + trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd], + sq.sq_node, sq.sq_port); ret = 0; switch (cmd) { -- cgit v1.2.3 From 8518307dc2b2a97b235c74920b45020f9bebe33e Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 22 Apr 2020 15:16:36 +0800 Subject: net: caif: use true,false for bool variables Fix the following coccicheck warning: net/caif/caif_dev.c:410:2-13: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:445:2-13: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:145:1-12: WARNING: Assignment of 0/1 to bool variable net/caif/caif_dev.c:223:1-12: WARNING: Assignment of 0/1 to bool variable Signed-off-by: Jason Yan Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 195d2d67be8a..c10e5a55758d 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -142,7 +142,7 @@ static void caif_flow_cb(struct sk_buff *skb) spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; - caifd->xoff = 0; + caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) @@ -220,7 +220,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); - caifd->xoff = 1; + caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; @@ -407,7 +407,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; } - caifd->xoff = 0; + caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); @@ -442,7 +442,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; - caifd->xoff = 0; + caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; -- cgit v1.2.3 From b75326c201242de9495ff98e5d5cff41d7fc0d9d Mon Sep 17 00:00:00 2001 From: Fernando Gont Date: Sun, 19 Apr 2020 09:24:57 -0300 Subject: ipv6: Honor all IPv6 PIO Valid Lifetime values RFC4862 5.5.3 e) prevents received Router Advertisements from reducing the Valid Lifetime of configured addresses to less than two hours, thus preventing hosts from reacting to the information provided by a router that has positive knowledge that a prefix has become invalid. This patch makes hosts honor all Valid Lifetime values, as per draft-gont-6man-slaac-renum-06, Section 4.2. 
This is meant to help mitigate the problem discussed in draft-ietf-v6ops-slaac-renum. Note: Attacks aiming at disabling an advertised prefix via a Valid Lifetime of 0 are not really more harmful than other attacks that can be performed via forged RA messages, such as those aiming at completely disabling a next-hop router via an RA that advertises a Router Lifetime of 0, or performing a Denial of Service (DoS) attack by advertising illegitimate prefixes via forged PIOs. In scenarios where RA-based attacks are of concern, proper mitigations such as RA-Guard [RFC6105] [RFC7113] should be implemented. Signed-off-by: Fernando Gont Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 -- net/ipv6/addrconf.c | 27 +++++++-------------------- 2 files changed, 7 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e0eabe58aa8b..fdb07105384c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -6,8 +6,6 @@ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ -#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ - #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..27b4fb6e452b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2564,7 +2564,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); - int create = 0, update_lft = 0; + int create = 0; if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -2608,32 +2608,19 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, unsigned long now; u32 stored_lft; - /* update lifetime (RFC2462 5.5.3 e) */ + /* Update lifetime (RFC4862 5.5.3 e) + * We deviate from RFC4862 by honoring all Valid Lifetimes to + * improve the reaction of SLAAC to renumbering events + * (draft-gont-6man-slaac-renum-06, Section 4.2) + */ spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - if (update_lft) { + if (!create && stored_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = now; -- cgit v1.2.3 From e131a5634830047923c694b4ce0c3b31745ff01b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 21 Apr 2020 16:41:08 +0300 Subject: net: dsa: add GRO support via gro_cells gro_cells lib is used by different encapsulating netdevices, such as geneve, macsec, vxlan etc. to speed up decapsulated traffic processing. CPU tag is a sort of "encapsulation", and we can use the same mechs to greatly improve overall DSA performance. skbs are passed to the GRO layer after removing CPU tags, so we don't need any new packet offload types as it was firstly proposed by me in the first GRO-over-DSA variant [1]. 
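For context before the numbers below: the gro_cells API has only three entry points, and the pattern adopted here for DSA slave devices is the same one the tunnel drivers use. A minimal sketch with a hypothetical driver and error handling trimmed:

	#include <net/gro_cells.h>

	struct my_priv {
		struct gro_cells gcells;
	};

	/* once at device setup */
	err = gro_cells_init(&priv->gcells, dev);

	/* per decapsulated skb, instead of netif_receive_skb() */
	gro_cells_receive(&priv->gcells, skb);

	/* once at teardown, after the device is unregistered */
	gro_cells_destroy(&priv->gcells);

gro_cells_receive() queues the skb on a per-cpu backlog that is drained from a NAPI poll context, which is what allows fully non-linear skbs to be merged by GRO.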
The size of struct gro_cells is sizeof(void *), so hot struct dsa_slave_priv becomes only 4/8 bytes bigger, and all critical fields remain in one 32-byte cacheline. The other positive side effect is that drivers for network devices that can be shipped as CPU ports of DSA-driven switches can now use napi_gro_frags() to pass skbs to the kernel. Packets built that way are completely non-linear and are likely being dropped without GRO. This was tested on a soon-to-be-mainlined Ethernet driver that uses napi_gro_frags(), and the overall performance was on par with the variant from [1], sometimes even better due to minimal overhead. net.core.gro_normal_batch tuning may help to push it to the limit on particular setups and platforms. iperf3 IPoE VLAN NAT TCP forwarding (port1.218 -> port0) setup on 1.2 GHz MIPS board:

5.7-rc2 baseline:

[ID] Interval           Transfer     Bitrate        Retr
[ 5] 0.00-120.01 sec    9.00 GBytes  644 Mbits/sec  413  sender
[ 5] 0.00-120.00 sec    8.99 GBytes  644 Mbits/sec       receiver

Iface      RX packets  TX packets
eth0       7097731     7097702
port0      426050      6671829
port1      6671681     425862
port1.218  6671677     425851

With this patch:

[ID] Interval           Transfer     Bitrate        Retr
[ 5] 0.00-120.01 sec    12.2 GBytes  870 Mbits/sec  122  sender
[ 5] 0.00-120.00 sec    12.2 GBytes  870 Mbits/sec       receiver

Iface      RX packets  TX packets
eth0       9474792     9474777
port0      455200      353288
port1      9019592     455035
port1.218  353144      455024

v2: - Add some performance examples in the commit message; - No functional changes. [1] https://lore.kernel.org/netdev/20191230143028.27313-1-alobakin@dlink.ru/ Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/dsa/Kconfig | 1 + net/dsa/dsa.c | 2 +- net/dsa/dsa_priv.h | 3 +++ net/dsa/slave.c | 10 +++++++++- 4 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 92663dcb3aa2..739613070d07 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + select GRO_CELLS select NET_SWITCHDEV select PHYLINK select NET_DEVLINK diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ee2610c4d46a..0384a911779e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -234,7 +234,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; - netif_receive_skb(skb); + gro_cells_receive(&p->gcells, skb); return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 904cc7c9b882..6d9a1ef65fa0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -11,6 +11,7 @@ #include #include #include +#include enum { DSA_NOTIFIER_AGEING_TIME, @@ -77,6 +78,8 @@ struct dsa_slave_priv { struct pcpu_sw_netstats *stats64; + struct gro_cells gcells; + /* DSA port data, such as switch, port index, etc.
*/ struct dsa_port *dp; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e94eb1aac602..f2c241cf3a80 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1762,6 +1762,11 @@ int dsa_slave_create(struct dsa_port *port) free_netdev(slave_dev); return -ENOMEM; } + + ret = gro_cells_init(&p->gcells, slave_dev); + if (ret) + goto out_free; + p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); p->xmit = cpu_dp->tag_ops->xmit; @@ -1781,7 +1786,7 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - goto out_free; + goto out_gcells; } dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); @@ -1800,6 +1805,8 @@ out_phy: phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); phylink_destroy(p->dp->pl); +out_gcells: + gro_cells_destroy(&p->gcells); out_free: free_percpu(p->stats64); free_netdev(slave_dev); @@ -1820,6 +1827,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); phylink_destroy(dp->pl); + gro_cells_destroy(&p->gcells); free_percpu(p->stats64); free_netdev(slave_dev); } -- cgit v1.2.3 From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows arming the napi hrtimer instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on a 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy as gro_flush_timeout (/sys/class/net/ethX/). By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone: NIC hard irqs should be rearmed as before. One concrete usage can be: echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset the napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQs, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not calling napi_complete_done(). This feature can also be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps serving cpus not idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 ++++++++++++++++++----------- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0750b54b3765..5a8d40f1ffe2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -329,6 +329,7 @@ struct napi_struct { unsigned long state; int weight; + int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -1995,6 +1996,7 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
*/ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0d9e46de205e..f3b650cd0923 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + dev->napi_defer_hard_irqs = val; + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, -- cgit v1.2.3 From 7e417a66b86c110f4b282945dac82e21e0b08328 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:28 -0700 Subject: net: napi: use READ_ONCE()/WRITE_ONCE() gro_flush_timeout and napi_defer_hard_irqs can be read from napi_complete_done() while other cpus write the value, without explicit synchronization. Use READ_ONCE()/WRITE_ONCE() to annotate the races. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/net-sysfs.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 67585484ad32..afff16849c26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6242,12 +6242,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = n->dev->gro_flush_timeout; - n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = n->dev->gro_flush_timeout; + timeout = READ_ONCE(n->dev->gro_flush_timeout); if (timeout) ret = false; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f3b650cd0923..880e89c894f6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - dev->gro_flush_timeout = val; + WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } @@ -384,7 +384,7 @@ NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { - dev->napi_defer_hard_irqs = val; + WRITE_ONCE(dev->napi_defer_hard_irqs, val); return 0; } -- cgit v1.2.3 From 3c9143d96852485725d330b9164062d8f2d90a38 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 23 Apr 2020 13:43:13 +0800 Subject: net: sched: Remove unnecessary cast in kfree Remove unnecessary casts in the argument to kfree. Signed-off-by: Xu Wang Signed-off-by: David S.
Miller --- net/sched/em_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index eecfe072c508..18755d29fd15 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -199,7 +199,7 @@ static void em_ipt_destroy(struct tcf_ematch *em) im->match->destroy(&par); } module_put(im->match->me); - kfree((void *)im); + kfree(im); } static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, -- cgit v1.2.3 From c7c4c44c9a95d87e50ced38f7480e779cb472174 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:02 +0800 Subject: net: openvswitch: expand the meters supported number In the kernel datapath of Open vSwitch, there are only 1024 meter buckets in one datapath. Installing more than 1024 meters (e.g. 8192) may lead to a performance drop. But in some cases, for example, when Open vSwitch is used as an edge gateway, there should be at least 20K meters, used for IP address bandwidth limitation. [The Open vSwitch userspace datapath has this issue too.] To make meters more scalable, this patch uses a meter array instead of hash tables, and expands/shrinks the array when necessary. So we can install more meters than before in the datapath. By introducing struct dp_meter_instance, it is easy to expand the meter table by changing the *ti pointer in struct dp_meter_table. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.h | 2 +- net/openvswitch/meter.c | 240 ++++++++++++++++++++++++++++++++++----------- net/openvswitch/meter.h | 16 ++- 3 files changed, 195 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e239a46c2f94..2016dd107939 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -82,7 +82,7 @@ struct datapath { u32 max_headroom; /* Switch meters. */ - struct hlist_head *meters; + struct dp_meter_table meter_tbl; }; /** diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 5010d1ddd4bd..f806ded1dd0a 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -19,8 +19,6 @@ #include "datapath.h" #include "meter.h" -#define METER_HASH_BUCKETS 1024 - static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, @@ -39,6 +37,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + static void ovs_meter_free(struct dp_meter *meter) { if (!meter) @@ -47,40 +50,153 @@ static void ovs_meter_free(struct dp_meter *meter) kfree_rcu(meter, rcu); } -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - /* Call with ovs_mutex or RCU read lock.
*/ -static struct dp_meter *lookup_meter(const struct datapath *dp, +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; - struct hlist_head *head; - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node, - lockdep_ovsl_is_held()) { - if (meter->id == meter_id) - return meter; - } + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + return NULL; } -static void attach_meter(struct datapath *dp, struct dp_meter *meter) +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(sizeof(*ti) + + sizeof(struct dp_meter *) * size, + GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) +{ + struct dp_meter_instance *ti; + + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); +} + +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + new_ti->dp_meters[i] = + rcu_dereference_ovsl(ti->dp_meters[i]); + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) { - struct hlist_head *head = meter_hash_bucket(dp, meter->id); + u32 hash; - hlist_add_head_rcu(&meter->dp_hash_node, head); + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } -static void detach_meter(struct dp_meter *meter) +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + + /* In generally, slots selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EBUSY; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + if (++tbl->count >= ti->n_meters) + if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) + goto expand_err; + + return 0; + +expand_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return -ENOMEM; +} + +static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti; + ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); + if (!meter) + return 0; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + /* Avoid hash collision, don't move slots to other place. 
+ * Make sure there are no references of meters in array + * which will be released. + */ + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + goto out; + + if (dp_meter_instance_realloc(tbl, half_size)) + goto shrink_err; + } + +out: + return 0; + +shrink_err: + dp_meter_instance_insert(ti, meter); + tbl->count++; + return -ENOMEM; } static struct sk_buff * @@ -273,6 +389,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = info->userhdr; + struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; @@ -300,12 +417,18 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } + meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); + old_meter = lookup_meter(meter_tbl, meter_id); + err = detach_meter(meter_tbl, old_meter); + if (err) + goto exit_unlock; + + err = attach_meter(meter_tbl, meter); + if (err) + goto exit_unlock; + ovs_unlock(); /* Build response with the meter_id and stats from @@ -337,14 +460,14 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; @@ -365,7 +488,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) } /* Locate meter, copy stats. 
*/ - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; @@ -390,18 +513,17 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *old_meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *old_meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); @@ -416,14 +538,19 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - old_meter = lookup_meter(dp, meter_id); + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); + + err = detach_meter(&dp->meter_tbl, old_meter); + if (err) + goto exit_unlock; } + ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); @@ -443,16 +570,16 @@ exit_unlock: bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { - struct dp_meter *meter; - struct dp_meter_band *band; long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; - u32 delta_ms; - u32 cost; + struct dp_meter_band *band; + struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; + u32 delta_ms; + u32 cost; - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. 
*/ if (!meter) return false; @@ -570,32 +697,27 @@ struct genl_family dp_meter_genl_family __ro_after_init = { int ovs_meters_init(struct datapath *dp) { - int i; + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti; - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); - - if (!dp->meters) + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) return -ENOMEM; - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); + rcu_assign_pointer(tbl->ti, ti); + tbl->count = 0; return 0; } void ovs_meters_exit(struct datapath *dp) { + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(ti->dp_meters[i]); - kfree(dp->meters); + dp_meter_instance_free(ti); } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f645913870bd..f52052d30a16 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -13,11 +13,13 @@ #include #include #include +#include #include "flow.h" struct datapath; #define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) struct dp_meter_band { u32 type; @@ -30,9 +32,6 @@ struct dp_meter { spinlock_t lock; /* Per meter lock */ struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ u32 id; u16 kbps:1, keep_stats:1; u16 n_bands; @@ -42,6 +41,17 @@ struct dp_meter { struct dp_meter_band bands[]; }; +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[]; +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; +}; + extern struct genl_family dp_meter_genl_family; int ovs_meters_init(struct datapath *dp); void ovs_meters_exit(struct datapath *dp); -- cgit v1.2.3 From eb58eebc7fb5e23c9cc7d557c0a9236630591526 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:03 +0800 Subject: net: openvswitch: set max limitation to meters Don't allow users to create meters without limit, as that may consume a large amount of kernel memory. The maximum number supported is derived from the available physical memory, with 20K meters as the default. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 57 ++++++++++++++++++++++++++++++++++--------- net/openvswitch/meter.h | 2 ++ 2 files changed, 49 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index f806ded1dd0a..372f4565872d 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,7 @@ static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter->id); + int err; /* In generally, slots selected should be empty, because * OvS uses id-pool to fetch a available id.
@@ -147,16 +149,24 @@ static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) dp_meter_instance_insert(ti, meter); /* That function is thread-safe. */ - if (++tbl->count >= ti->n_meters) - if (dp_meter_instance_realloc(tbl, ti->n_meters * 2)) - goto expand_err; + tbl->count++; + if (tbl->count >= tbl->max_meters_allowed) { + err = -EFBIG; + goto attach_err; + } + + if (tbl->count >= ti->n_meters && + dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { + err = -ENOMEM; + goto attach_err; + } return 0; -expand_err: +attach_err: dp_meter_instance_remove(ti, meter); tbl->count--; - return -ENOMEM; + return err; } static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) @@ -266,18 +276,32 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct sk_buff *reply; + struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; - int err; + struct sk_buff *reply; + struct datapath *dp; + int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); - if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || - nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, + dp->meter_tbl.max_meters_allowed)) + goto exit_unlock; + + ovs_unlock(); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); @@ -296,9 +320,10 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_unlock: + ovs_unlock(); nla_put_failure: nlmsg_free(reply); - err = -EMSGSIZE; return err; } @@ -699,15 +724,27 @@ int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; + unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; + /* Allow meters in a datapath to use ~3.12% of physical memory. */ + free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); + tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), + DP_METER_NUM_MAX); + if (!tbl->max_meters_allowed) + goto out_err; + rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; + +out_err: + dp_meter_instance_free(ti); + return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f52052d30a16..61a3ca43cd77 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -20,6 +20,7 @@ struct datapath; #define DP_MAX_BANDS 1 #define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) +#define DP_METER_NUM_MAX (200000UL) struct dp_meter_band { u32 type; @@ -50,6 +51,7 @@ struct dp_meter_instance { struct dp_meter_table { struct dp_meter_instance __rcu *ti; u32 count; + u32 max_meters_allowed; }; extern struct genl_family dp_meter_genl_family; -- cgit v1.2.3 From a8e387384f554ea0b63889a7682ffcddee24df8b Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:04 +0800 Subject: net: openvswitch: remove the unnecessary check Before invoking ovs_meter_cmd_reply_stats(), "meter" was already checked, so don't check it again in that function.
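A quick sanity check on the "~3.12%" comment in the meter limit patch above: nr_free_buffer_pages() * (PAGE_SIZE >> 5) budgets PAGE_SIZE / 32 bytes per free buffer page, and 1/32 = 3.125%; with 4 KiB pages that is 128 bytes of budget per page. The resulting cap is min(budget / sizeof(struct dp_meter), DP_METER_NUM_MAX), i.e. at most 200,000 meters per datapath.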
Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 372f4565872d..b7893b0d6423 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -242,12 +242,11 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; - if (!meter) - return 0; - if (nla_put(reply, OVS_METER_ATTR_STATS, - sizeof(struct ovs_flow_stats), &meter->stats) || - nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + sizeof(struct ovs_flow_stats), &meter->stats)) + goto error; + + if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; -- cgit v1.2.3 From c77350089052cafa8125169e37463ab7028d6a18 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:05 +0800 Subject: net: openvswitch: make EINVAL return value more obvious Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index b7893b0d6423..e36b464b32a5 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -419,9 +419,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) u32 meter_id; bool failed; - if (!a[OVS_METER_ATTR_ID]) { - return -ENODEV; - } + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; meter = dp_meter_create(a); if (IS_ERR_OR_NULL(meter)) -- cgit v1.2.3 From e57358873bb5d6caa882b9684f59140912b37dde Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 24 Apr 2020 08:08:06 +0800 Subject: net: openvswitch: use u64 for meter bucket When setting the meter rate to 4+Gbps, there is an overflow, the meters don't work as expected. Cc: Pravin B Shelar Cc: Andy Zhou Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- net/openvswitch/meter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index e36b464b32a5..915f31123f23 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -392,7 +392,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000; + band->bucket = (band->burst_size + band->rate) * 1000ULL; band_max_delta_t = band->bucket / band->rate; if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index 61a3ca43cd77..0c33889a8515 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -26,7 +26,7 @@ struct dp_meter_band { u32 type; u32 rate; u32 burst_size; - u32 bucket; /* 1/1000 packets, or in bits */ + u64 bucket; /* 1/1000 packets, or in bits */ struct ovs_flow_stats stats; }; -- cgit v1.2.3 From 1db364c88695272e3410eb4b5d4595c8cb15db30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:38:04 +0200 Subject: mac80211: mlme: remove duplicate AID bookkeeping Maintain the connection AID only in sdata->vif.bss_conf.aid, not also in sdata->u.mgd.aid. 
Keep setting that where we set ifmgd->aid before, which has the side effect of exposing the AID to the driver before the station entry (AP) is marked associated, in case it needs it then. Requested-by: Felix Fietkau Signed-off-by: Johannes Berg Tested-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/20200417123802.085d4a322b0c.I2e7a2ceceea8c6880219f9e9ee4d4ac985fd295a@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 2 +- net/mac80211/ieee80211_i.h | 2 -- net/mac80211/mlme.c | 7 +++---- net/mac80211/tdls.c | 3 +-- net/mac80211/tx.c | 2 +- 5 files changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 3dbe7c5cefd1..d7e955127d5c 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -236,7 +236,7 @@ IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); -IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(aid, vif.bss_conf.aid, DEC); IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f8ed4f621f7f..934a91bef575 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -450,8 +450,6 @@ struct ieee80211_if_managed { u8 bssid[ETH_ALEN] __aligned(2); - u16 aid; - bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ bool have_beacon; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 16d75da0996a..7139335f29c0 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3249,7 +3249,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, return false; } - ifmgd->aid = aid; + sdata->vif.bss_conf.aid = aid; ifmgd->tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); @@ -3521,9 +3521,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->protected_keep_alive = false; } - /* set AID and assoc capability, + /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ - bss_conf->aid = aid; bss_conf->assoc_capability = capab_info; ieee80211_set_associated(sdata, cbss, changed); @@ -3948,7 +3947,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mgmt->bssid, bssid); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) { + ieee80211_check_tim(elems.tim, elems.tim_len, bss_conf->aid)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fca1f5477396..7ff22f9d6e80 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -226,12 +226,11 @@ static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, static void ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *pos = skb_put(skb, 4); *pos++ = WLAN_EID_AID; *pos++ = 2; /* len */ - put_unaligned_le16(ifmgd->aid, pos); + put_unaligned_le16(sdata->vif.bss_conf.aid, pos); } /* translate numbering in the WMM parameter IE to the mac80211 notation */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c 
index 82846aca86d9..3dc1990e15c5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5006,7 +5006,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); - pspoll->aid = cpu_to_le16(ifmgd->aid); + pspoll->aid = cpu_to_le16(sdata->vif.bss_conf.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); -- cgit v1.2.3 From 90e8f58dfc04d1bd48ca155cc55ebf7ba1824864 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 11:18:31 +0200 Subject: mac80211: fix drv_config_iface_filter() behaviour There are two bugs with this: first, it shouldn't be called on an interface that's down; and second, it should then be called when the interface comes up. Note that the currently only user (iwlwifi) doesn't seem to care about either of these scenarios. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200417111830.401d82c7a0bf.I5dc7d718816460c2d8d89c7af6c215f9e2b3078f@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 15 +++++++++------ net/mac80211/iface.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0f72813fed53..b90f2131ec7a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3421,12 +3421,15 @@ static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, if (!local->open_count) break; - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); + if (ieee80211_sdata_running(sdata)) { + if (sdata->vif.probe_req_reg == 1) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + else if (sdata->vif.probe_req_reg == 0) + drv_config_iface_filter(local, sdata, 0, + FIF_PROBE_REQ); + } ieee80211_configure_filter(local); break; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d069825705d6..f900c84fb40f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -644,6 +644,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) local->fif_probe_req++; } + if (sdata->vif.probe_req_reg) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); -- cgit v1.2.3 From 4d797fce783a8eb11dd23463828db84743795046 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:47 +0300 Subject: cfg80211: Unprotected Beacon frame RX indication Extend cfg80211_rx_unprot_mlme_mgmt() to cover indication of unprotected Beacon frames in addition to the previously used Deauthentication and Disassociation frames. The Beacon frame case is quite similar, but has a couple of exceptions: this is used both with fully unprotected and also incorrectly protected frames, and there is a rate limit on the events to avoid unnecessarily flooding netlink with events in case something goes wrong.
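The rate limit mentioned above is a plain jiffies-timestamp check, visible in the nl80211 hunk below; reduced to a sketch, the pattern is:

	/* report at most one unprotected-beacon event per 10 s per wdev */
	if (wdev->unprot_beacon_reported &&
	    elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000)
		return;
	...
	wdev->unprot_beacon_reported = jiffies;

elapsed_jiffies_msecs() accounts for jiffies wraparound, so the comparison stays correct across a counter rollover.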
Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-1-jouni@codeaurora.org [add missing kernel-doc] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++-- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 13 +++++++++++-- net/wireless/sme.c | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..775952677b3d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5045,6 +5045,8 @@ struct cfg80211_cqm_config; * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock * @pmsr_free_wk: (private) peer measurements cleanup work + * @unprot_beacon_reported: (private) timestamp of last + * unprotected beacon report */ struct wireless_dev { struct wiphy *wiphy; @@ -5121,6 +5123,8 @@ struct wireless_dev { struct list_head pmsr_list; spinlock_t pmsr_lock; struct work_struct pmsr_free_wk; + + unsigned long unprot_beacon_reported; }; static inline u8 *wdev_address(struct wireless_dev *wdev) @@ -6135,12 +6139,16 @@ void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); /** * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame * @dev: network device - * @buf: deauthentication frame (header + body) + * @buf: received management frame (header + body) * @len: length of the frame data * * This function is called whenever a received deauthentication or dissassoc * frame has been dropped in station mode because of MFP being used but the - * frame was not protected. This function may sleep. + * frame was not protected. This is also used to notify reception of a Beacon + * frame that was dropped because it did not include a valid MME MIC while + * beacon protection was enabled (BIGTK configured in station mode). + * + * This function may sleep. */ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2b691161830f..afdd9802ccb8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1151,6 +1151,11 @@ * @NL80211_CMD_SET_TID_CONFIG: Data frame TID specific configuration * is passed using %NL80211_ATTR_TID_CONFIG attribute. * + * @NL80211_CMD_UNPROT_BEACON: Unprotected or incorrectly protected Beacon + * frame. This event is used to indicate that a received Beacon frame was + * dropped because it did not include a valid MME MIC while beacon + * protection was enabled (BIGTK configured in station mode). 
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1377,6 +1382,8 @@ enum nl80211_commands { NL80211_CMD_SET_TID_CONFIG, + NL80211_CMD_UNPROT_BEACON, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..2127e5344b1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -15542,10 +15542,19 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, if (WARN_ON(len < 2)) return; - if (ieee80211_is_deauth(mgmt->frame_control)) + if (ieee80211_is_deauth(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; - else + } else if (ieee80211_is_disassoc(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DISASSOCIATE; + } else if (ieee80211_is_beacon(mgmt->frame_control)) { + if (wdev->unprot_beacon_reported && + elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000) + return; + cmd = NL80211_CMD_UNPROT_BEACON; + wdev->unprot_beacon_reported = jiffies; + } else { + return; + } trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ac3e60aa1fc8..3554c0d951f4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -694,6 +694,7 @@ void __cfg80211_connect_result(struct net_device *dev, return; } + wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); @@ -921,6 +922,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_hold_bss(bss_from_pub(info->bss)); wdev->current_bss = bss_from_pub(info->bss); + wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); -- cgit v1.2.3 From 9eaf183af741e3d8393eb571ac8aec9ee7d6530e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:48 +0300 Subject: mac80211: Report beacon protection failures to user space Report received Beacon frames that do not have a valid MME MIC when beacon protection is enabled. This covers both the cases of no MME in the received frame and invalid MIC in the MME. 
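In condensed form, the change amounts to the following hypothetical helper (not the actual mac80211 code; the real call sites are visible in the diff below):

/* hypothetical condensation: on any beacon-protection failure, i.e. no
 * MME present, an unexpected BIP key index, or a MIC failure, the frame
 * is now reported to user space before being dropped */
static void report_unprot_beacon(struct ieee80211_rx_data *rx)
{
	cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
				     rx->skb->data, rx->skb->len);
	/* the frame is still dropped afterwards, exactly as before */
}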
Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-2-jouni@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 91a13aee4378..a724551b8ddf 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1984,8 +1984,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) + NUM_DEFAULT_BEACON_KEYS) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + } rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx); if (!rx->key) @@ -2131,6 +2135,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, skb->len); + return result; } @@ -2411,8 +2419,12 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) return -EACCES; } if (unlikely(ieee80211_is_beacon(fc) && rx->key && - ieee80211_get_mmie_keyidx(rx->skb) < 0)) + ieee80211_get_mmie_keyidx(rx->skb) < 0)) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + rx->skb->data, + rx->skb->len); return -EACCES; + } /* * When using MFP, Action frames are not allowed prior to * having configured keys. -- cgit v1.2.3 From 6cd536fe62ef58d7c4eac2da07ab0ed7fd19010d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:43:01 +0200 Subject: cfg80211: change internal management frame registration API Almost all drivers below cfg80211 get the API wrong (except for mac80211) and are unable to cope with multiple registrations for the same frame type, which is valid due to the match filter. This seems to indicate the API is wrong, and we should maintain the full information in cfg80211 instead of the drivers. Change the API to no longer inform the driver about individual registrations and unregistrations, but rather every time about the entire state of the entire wiphy and single wdev, whenever it may have changed. This also simplifies the code in cfg80211 as it no longer has to track exactly what was unregistered and can free things immediately.
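The two bitmaps in struct mgmt_frame_regs are cheap to derive from the registration lists; a stand-alone sketch (the BIT(frame_type >> 4) math matches the patch, the registration list is invented for illustration):

#include <stdio.h>

struct reg_entry { unsigned frame_type; int wdev_id; };

int main(void)
{
	const struct reg_entry regs[] = {
		{ 0x0040, 0 },	/* Probe Request registered on wdev 0 */
		{ 0x00d0, 1 },	/* Action frame registered on wdev 1 */
	};
	unsigned global_stypes = 0, interface_stypes = 0; /* for wdev 1 */
	unsigned i;

	for (i = 0; i < sizeof(regs) / sizeof(regs[0]); i++) {
		unsigned mask = 1u << (regs[i].frame_type >> 4);

		global_stypes |= mask;		  /* whole wiphy */
		if (regs[i].wdev_id == 1)
			interface_stypes |= mask; /* just this wdev */
	}

	/* prints: global 0x2010, interface 0x2000 */
	printf("global 0x%04x, interface 0x%04x\n",
	       global_stypes, interface_stypes);
	return 0;
}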
Signed-off-by: Johannes Berg Acked-by: Arend van Spriel Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124300.f47f3828afc8.I7f81ef59c2c5a340d7075fb3c6d0e08e8aeffe07@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 26 ++++--- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 19 ++--- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 16 ++--- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 83 +++++++++++----------- include/net/cfg80211.h | 23 ++++-- include/net/mac80211.h | 2 +- net/mac80211/cfg.c | 50 ++++++------- net/mac80211/ieee80211_i.h | 2 +- net/wireless/core.c | 7 +- net/wireless/core.h | 6 +- net/wireless/mlme.c | 72 ++++++++----------- net/wireless/rdev-ops.h | 11 +-- net/wireless/trace.h | 20 +++--- 13 files changed, 159 insertions(+), 178 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 37cf602d8adf..67f8f2aa7a53 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -3249,22 +3249,19 @@ static int ath6kl_get_antenna(struct wiphy *wiphy, return 0; } -static void ath6kl_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +static void ath6kl_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct ath6kl_vif *vif = ath6kl_vif_from_wdev(wdev); - ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: frame_type=0x%x reg=%d\n", - __func__, frame_type, reg); - if (frame_type == IEEE80211_STYPE_PROBE_REQ) { - /* - * Note: This notification callback is not allowed to sleep, so - * we cannot send WMI_PROBE_REQ_REPORT_CMD here. Instead, we - * hardcode target to report Probe Request frames all the time. - */ - vif->probe_req_report = reg; - } + /* + * FIXME: send WMI_PROBE_REQ_REPORT_CMD here instead of hardcoding + * the reporting in the target all the time, this callback + * *is* allowed to sleep after all. 
+ */ + vif->probe_req_report = + upd->interface_stypes & BIT(IEEE80211_STYPE_PROBE_REQ >> 4); } static int ath6kl_cfg80211_sscan_start(struct wiphy *wiphy, @@ -3464,7 +3461,8 @@ static struct cfg80211_ops ath6kl_cfg80211_ops = { .remain_on_channel = ath6kl_remain_on_channel, .cancel_remain_on_channel = ath6kl_cancel_remain_on_channel, .mgmt_tx = ath6kl_mgmt_tx, - .mgmt_frame_register = ath6kl_mgmt_frame_register, + .update_mgmt_frame_registrations = + ath6kl_update_mgmt_frame_registrations, .get_antenna = ath6kl_get_antenna, .sched_scan_start = ath6kl_cfg80211_sscan_start, .sched_scan_stop = ath6kl_cfg80211_sscan_stop, diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 2ba165330038..fa846471dac2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4979,21 +4979,15 @@ brcmf_cfg80211_change_station(struct wiphy *wiphy, struct net_device *ndev, } static void -brcmf_cfg80211_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +brcmf_cfg80211_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct brcmf_cfg80211_vif *vif; - u16 mgmt_type; - brcmf_dbg(TRACE, "Enter, frame_type %04x, reg=%d\n", frame_type, reg); - - mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; vif = container_of(wdev, struct brcmf_cfg80211_vif, wdev); - if (reg) - vif->mgmt_rx_reg |= BIT(mgmt_type); - else - vif->mgmt_rx_reg &= ~BIT(mgmt_type); + + vif->mgmt_rx_reg = upd->interface_stypes; } @@ -5408,7 +5402,8 @@ static struct cfg80211_ops brcmf_cfg80211_ops = { .change_station = brcmf_cfg80211_change_station, .sched_scan_start = brcmf_cfg80211_sched_scan_start, .sched_scan_stop = brcmf_cfg80211_sched_scan_stop, - .mgmt_frame_register = brcmf_cfg80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + brcmf_cfg80211_update_mgmt_frame_registrations, .mgmt_tx = brcmf_cfg80211_mgmt_tx, .remain_on_channel = brcmf_p2p_remain_on_channel, .cancel_remain_on_channel = brcmf_cfg80211_cancel_remain_on_channel, diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 1566d2197906..21a17d4017c4 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -269,17 +269,12 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, * CFG802.11 operation handler to register a mgmt frame. 
*/ static void -mwifiex_cfg80211_mgmt_frame_register(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg) +mwifiex_cfg80211_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); - u32 mask; - - if (reg) - mask = priv->mgmt_frame_mask | BIT(frame_type >> 4); - else - mask = priv->mgmt_frame_mask & ~BIT(frame_type >> 4); + u32 mask = upd->interface_stypes; if (mask != priv->mgmt_frame_mask) { priv->mgmt_frame_mask = mask; @@ -4189,7 +4184,8 @@ static struct cfg80211_ops mwifiex_cfg80211_ops = { .del_key = mwifiex_cfg80211_del_key, .set_default_mgmt_key = mwifiex_cfg80211_set_default_mgmt_key, .mgmt_tx = mwifiex_cfg80211_mgmt_tx, - .mgmt_frame_register = mwifiex_cfg80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + mwifiex_cfg80211_update_mgmt_frame_registrations, .remain_on_channel = mwifiex_cfg80211_remain_on_channel, .cancel_remain_on_channel = mwifiex_cfg80211_cancel_remain_on_channel, .set_default_key = mwifiex_cfg80211_set_default_key, diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 8be17106008d..54cdf3ad09d7 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -389,55 +389,57 @@ static int qtnf_set_wiphy_params(struct wiphy *wiphy, u32 changed) } static void -qtnf_mgmt_frame_register(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) +qtnf_update_mgmt_frame_registrations(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); - u16 mgmt_type; - u16 new_mask; - u16 qlink_frame_type = 0; + u16 new_mask = upd->interface_stypes; + u16 old_mask = vif->mgmt_frames_bitmask; + static const struct { + u16 mask, qlink_type; + } updates[] = { + { + .mask = BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_ASSOC_REQ >> 4), + .qlink_type = QLINK_MGMT_FRAME_ASSOC_REQ, + }, + { + .mask = BIT(IEEE80211_STYPE_AUTH >> 4), + .qlink_type = QLINK_MGMT_FRAME_AUTH, + }, + { + .mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + .qlink_type = QLINK_MGMT_FRAME_PROBE_REQ, + }, + { + .mask = BIT(IEEE80211_STYPE_ACTION >> 4), + .qlink_type = QLINK_MGMT_FRAME_ACTION, + }, + }; + unsigned int i; - mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; + if (new_mask == old_mask) + return; - if (reg) - new_mask = vif->mgmt_frames_bitmask | BIT(mgmt_type); - else - new_mask = vif->mgmt_frames_bitmask & ~BIT(mgmt_type); + for (i = 0; i < ARRAY_SIZE(updates); i++) { + u16 mask = updates[i].mask; + u16 qlink_frame_type = updates[i].qlink_type; + bool reg; - if (new_mask == vif->mgmt_frames_bitmask) - return; + /* the ! 
are here due to the assoc/reassoc merge */ + if (!(new_mask & mask) == !(old_mask & mask)) + continue; - switch (frame_type & IEEE80211_FCTL_STYPE) { - case IEEE80211_STYPE_REASSOC_REQ: - case IEEE80211_STYPE_ASSOC_REQ: - qlink_frame_type = QLINK_MGMT_FRAME_ASSOC_REQ; - break; - case IEEE80211_STYPE_AUTH: - qlink_frame_type = QLINK_MGMT_FRAME_AUTH; - break; - case IEEE80211_STYPE_PROBE_REQ: - qlink_frame_type = QLINK_MGMT_FRAME_PROBE_REQ; - break; - case IEEE80211_STYPE_ACTION: - qlink_frame_type = QLINK_MGMT_FRAME_ACTION; - break; - default: - pr_warn("VIF%u.%u: unsupported frame type: %X\n", - vif->mac->macid, vif->vifid, - (frame_type & IEEE80211_FCTL_STYPE) >> 4); - return; - } + reg = new_mask & mask; - if (qtnf_cmd_send_register_mgmt(vif, qlink_frame_type, reg)) { - pr_warn("VIF%u.%u: failed to %sregister mgmt frame type 0x%x\n", - vif->mac->macid, vif->vifid, reg ? "" : "un", - frame_type); - return; + if (qtnf_cmd_send_register_mgmt(vif, qlink_frame_type, reg)) + pr_warn("VIF%u.%u: failed to %sregister qlink frame type 0x%x\n", + vif->mac->macid, vif->vifid, reg ? "" : "un", + qlink_frame_type); } vif->mgmt_frames_bitmask = new_mask; - pr_debug("VIF%u.%u: %sregistered mgmt frame type 0x%x\n", - vif->mac->macid, vif->vifid, reg ? "" : "un", frame_type); } static int @@ -1017,7 +1019,8 @@ static struct cfg80211_ops qtn_cfg80211_ops = { .change_beacon = qtnf_change_beacon, .stop_ap = qtnf_stop_ap, .set_wiphy_params = qtnf_set_wiphy_params, - .mgmt_frame_register = qtnf_mgmt_frame_register, + .update_mgmt_frame_registrations = + qtnf_update_mgmt_frame_registrations, .mgmt_tx = qtnf_mgmt_tx, .change_station = qtnf_change_station, .del_station = qtnf_del_station, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 775952677b3d..bc273f6d60f2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3384,6 +3384,17 @@ struct cfg80211_update_owe_info { size_t ie_len; }; +/** + * struct mgmt_frame_regs - management frame registrations data + * @global_stypes: bitmap of management frame subtypes registered + * for the entire device + * @interface_stypes: bitmap of management frame subtypes registered + * for the given interface + */ +struct mgmt_frame_regs { + u32 global_stypes, interface_stypes; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3608,8 +3619,8 @@ struct cfg80211_update_owe_info { * The driver should not call cfg80211_sched_scan_stopped() for a requested * stop (when this method returns 0). * - * @mgmt_frame_register: Notify driver that a management frame type was - * registered. The callback is allowed to sleep. + * @update_mgmt_frame_registrations: Notify the driver that management frame + * registrations were updated. The callback is allowed to sleep. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. 
Drivers may @@ -3932,9 +3943,9 @@ struct cfg80211_ops { struct net_device *dev, u32 rate, u32 pkts, u32 intvl); - void (*mgmt_frame_register)(struct wiphy *wiphy, - struct wireless_dev *wdev, - u16 frame_type, bool reg); + void (*update_mgmt_frame_registrations)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd); int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); @@ -5015,6 +5026,7 @@ struct cfg80211_cqm_config; * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_lock: lock for the list + * @mgmt_registrations_update_wk: update work to defer from atomic context * @mtx: mutex used to lock data in this struct, may be used by drivers * and some API functions require it held * @beacon_interval: beacon interval used on this device for transmitting @@ -5060,6 +5072,7 @@ struct wireless_dev { struct list_head mgmt_registrations; spinlock_t mgmt_registrations_lock; + struct work_struct mgmt_registrations_update_wk; struct mutex mtx; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b6b4de0e4b5e..f6dc5a38720f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1647,7 +1647,7 @@ struct ieee80211_vif { struct dentry *debugfs_dir; #endif - unsigned int probe_req_reg; + bool probe_req_reg; bool txqs_stopped[IEEE80211_NUM_ACS]; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b90f2131ec7a..e62b4764e82e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3398,44 +3398,35 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, return 0; } -static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, +static void +ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) + struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + bool global_change, intf_change; - switch (frame_type) { - case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: - if (reg) { - local->probe_req_reg++; - sdata->vif.probe_req_reg++; - } else { - if (local->probe_req_reg) - local->probe_req_reg--; + global_change = + local->probe_req_reg != !!(upd->global_stypes & preq_mask); + local->probe_req_reg = upd->global_stypes & preq_mask; - if (sdata->vif.probe_req_reg) - sdata->vif.probe_req_reg--; - } + intf_change = sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask); + sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; - if (!local->open_count) - break; + if (!local->open_count) + return; - if (ieee80211_sdata_running(sdata)) { - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, - FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); - } + if (intf_change && ieee80211_sdata_running(sdata)) + drv_config_iface_filter(local, sdata, + sdata->vif.probe_req_reg ? 
+ FIF_PROBE_REQ : 0, + FIF_PROBE_REQ); + if (global_change) ieee80211_configure_filter(local); - break; - default: - break; - } } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) @@ -4020,7 +4011,8 @@ const struct cfg80211_ops mac80211_config_ops = { .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, - .mgmt_frame_register = ieee80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 934a91bef575..da41ee996d3d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1167,7 +1167,7 @@ struct ieee80211_local { /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; - int probe_req_reg; + bool probe_req_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/wireless/core.c b/net/wireless/core.c index 341402b4f178..5757dea2aa94 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -480,9 +480,6 @@ use_default_name: INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); - INIT_LIST_HEAD(&rdev->mlme_unreg); - spin_lock_init(&rdev->mlme_unreg_lock); - INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -1030,7 +1027,6 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); - flush_work(&rdev->mlme_unreg_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); @@ -1094,6 +1090,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); + flush_work(&wdev->mgmt_registrations_update_wk); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: @@ -1238,6 +1235,8 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); + INIT_WORK(&wdev->mgmt_registrations_update_wk, + cfg80211_mgmt_registrations_update_wk); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index bb897a803ffe..30fb2c35ae43 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -60,10 +60,6 @@ struct cfg80211_registered_device { struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; - struct list_head mlme_unreg; - spinlock_t mlme_unreg_lock; - struct work_struct mlme_unreg_wk; - /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; @@ -386,7 +382,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len, struct netlink_ext_ack *extack); -void cfg80211_mlme_unreg_wk(struct work_struct *wk); +void 
cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e4805a3bd310..2e1a21e90b83 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -429,43 +429,37 @@ struct cfg80211_mgmt_registration { u8 match[]; }; -static void -cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev) +static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; + struct mgmt_frame_regs upd = {}; ASSERT_RTNL(); - spin_lock_bh(&rdev->mlme_unreg_lock); - while ((reg = list_first_entry_or_null(&rdev->mlme_unreg, - struct cfg80211_mgmt_registration, - list))) { - list_del(®->list); - spin_unlock_bh(&rdev->mlme_unreg_lock); - - if (rdev->ops->mgmt_frame_register) { - u16 frame_type = le16_to_cpu(reg->frame_type); + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { + list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { + u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); - rdev_mgmt_frame_register(rdev, reg->wdev, - frame_type, false); + upd.global_stypes |= mask; + if (tmp == wdev) + upd.interface_stypes |= mask; } - - kfree(reg); - - spin_lock_bh(&rdev->mlme_unreg_lock); } - spin_unlock_bh(&rdev->mlme_unreg_lock); + rcu_read_unlock(); + + rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } -void cfg80211_mlme_unreg_wk(struct work_struct *wk) +void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { - struct cfg80211_registered_device *rdev; - - rdev = container_of(wk, struct cfg80211_registered_device, - mlme_unreg_wk); + struct wireless_dev *wdev = container_of(wk, struct wireless_dev, + mgmt_registrations_update_wk); rtnl_lock(); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); rtnl_unlock(); } @@ -473,8 +467,6 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, struct netlink_ext_ack *extack) { - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -534,10 +526,8 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, } } - if (err) { - kfree(nreg); + if (err) goto out; - } memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; @@ -547,15 +537,12 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, list_add(&nreg->list, &wdev->mgmt_registrations); spin_unlock_bh(&wdev->mgmt_registrations_lock); - /* process all unregistrations to avoid driver confusion */ - cfg80211_process_mlme_unregistrations(rdev); - - if (rdev->ops->mgmt_frame_register) - rdev_mgmt_frame_register(rdev, wdev, frame_type, true); + cfg80211_mgmt_registrations_update(wdev); return 0; out: + kfree(nreg); spin_unlock_bh(&wdev->mgmt_registrations_lock); return err; @@ -574,11 +561,9 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) continue; list_del(®->list); - spin_lock(&rdev->mlme_unreg_lock); - list_add_tail(®->list, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + kfree(reg); - 
schedule_work(&rdev->mlme_unreg_wk); + schedule_work(&wdev->mgmt_registrations_update_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -594,15 +579,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&wdev->mgmt_registrations_lock); - spin_lock(&rdev->mlme_unreg_lock); - list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { + list_del(®->list); + kfree(reg); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 99462f0c4e08..df5142e86c4f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -819,13 +819,16 @@ rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, } static inline void -rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u16 frame_type, bool reg) +rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { might_sleep(); - trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); - rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); + trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); + if (rdev->ops->update_mgmt_frame_registrations) + rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, + upd); trace_rdev_return_void(&rdev->wiphy); } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 839df54cee21..ee736620f1e3 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1582,25 +1582,25 @@ TRACE_EVENT(rdev_set_bitrate_mask, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) ); -TRACE_EVENT(rdev_mgmt_frame_register, +TRACE_EVENT(rdev_update_mgmt_frame_registrations, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg), - TP_ARGS(wiphy, wdev, frame_type, reg), + struct mgmt_frame_regs *upd), + TP_ARGS(wiphy, wdev, upd), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY - __field(u16, frame_type) - __field(bool, reg) + __field(u16, global_stypes) + __field(u16, interface_stypes) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; - __entry->frame_type = frame_type; - __entry->reg = reg; + __entry->global_stypes = upd->global_stypes; + __entry->interface_stypes = upd->interface_stypes; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", frame_type: 0x%.2x, reg: %s ", - WIPHY_PR_ARG, WDEV_PR_ARG, __entry->frame_type, - __entry->reg ? "true" : "false") + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->global_stypes, __entry->interface_stypes) ); TRACE_EVENT(rdev_return_int_tx_rx, -- cgit v1.2.3 From 9dba48a6ece79da064655736dc7347a5fcadedef Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:40:15 +0200 Subject: cfg80211: support multicast RX registration For DPP, there's a need to receive multicast action frames, but many drivers need a special filter configuration for this. 
Support announcing from userspace in the management registration that multicast RX is required, with an extended feature flag if the driver handles this. Signed-off-by: Johannes Berg Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124013.c46238801048.Ib041d437ce0bff28a0c6d5dc915f68f1d8591002@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/core.h | 3 ++- net/wireless/mlme.c | 38 ++++++++++++++++++++++++++++++-------- net/wireless/nl80211.c | 10 ++++++++++ 5 files changed, 59 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bc273f6d60f2..dbb9675fe38f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3390,9 +3390,13 @@ struct cfg80211_update_owe_info { * for the entire device * @interface_stypes: bitmap of management frame subtypes registered * for the given interface + * @global_mcast_rx: mcast RX is needed globally for these subtypes + * @interface_mcast_stypes: mcast RX is needed on this interface + * for these subtypes */ struct mgmt_frame_regs { u32 global_stypes, interface_stypes; + u32 global_mcast_stypes, interface_mcast_stypes; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index afdd9802ccb8..e0dc89eceab8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -687,6 +687,10 @@ * four bytes for vendor frames including the OUI. The registration * cannot be dropped, but is removed automatically when the netlink * socket is closed. Multiple registrations can be made. + * The %NL80211_ATTR_RECEIVE_MULTICAST flag attribute can be given if + * %NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS is available, in which + * case the registration can also be modified to include/exclude the + * flag, rather than requiring unregistration to change it. * @NL80211_CMD_REGISTER_ACTION: Alias for @NL80211_CMD_REGISTER_FRAME for * backward compatibility * @NL80211_CMD_FRAME: Management frame TX request and RX notification. This @@ -2477,6 +2481,9 @@ enum nl80211_commands { * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. * + * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the + * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2952,6 +2959,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_LIFETIME, NL80211_ATTR_PMK_REAUTH_THRESHOLD, + NL80211_ATTR_RECEIVE_MULTICAST, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5691,6 +5700,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_DEL_IBSS_STA: The driver supports removing stations * in IBSS mode, essentially by dropping their state. * + * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations + * are possible for multicast frames and those will be reported properly. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5742,6 +5754,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH, NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/core.h b/net/wireless/core.h index 30fb2c35ae43..639d41896573 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -381,7 +381,8 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack); + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack); void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 2e1a21e90b83..409497a3527d 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -426,6 +426,8 @@ struct cfg80211_mgmt_registration { __le16 frame_type; + bool multicast_rx; + u8 match[]; }; @@ -442,10 +444,18 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); + u32 mcast_mask = 0; + + if (reg->multicast_rx) + mcast_mask = mask; upd.global_stypes |= mask; - if (tmp == wdev) + upd.global_mcast_stypes |= mcast_mask; + + if (tmp == wdev) { upd.interface_stypes |= mask; + upd.interface_mcast_stypes |= mcast_mask; + } } } rcu_read_unlock(); @@ -465,11 +475,13 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack) + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack) { struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; + bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; @@ -520,6 +532,11 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, continue; if (memcmp(reg->match, match_data, mlen) == 0) { + if (reg->multicast_rx != multicast_rx) { + update_multicast = true; + reg->multicast_rx = multicast_rx; + break; + } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; @@ -529,12 +546,17 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (err) goto out; - memcpy(nreg->match, match_data, match_len); - nreg->match_len = match_len; - nreg->nlportid = snd_portid; - nreg->frame_type = cpu_to_le16(frame_type); - nreg->wdev = wdev; - list_add(&nreg->list, &wdev->mgmt_registrations); + if (update_multicast) { + kfree(nreg); + } else { + memcpy(nreg->match, match_data, match_len); + nreg->match_len = match_len; + nreg->nlportid = snd_portid; + nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; + nreg->multicast_rx = multicast_rx; + list_add(&nreg->list, &wdev->mgmt_registrations); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2127e5344b1a..73a3e885d4dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ 
-661,6 +661,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG }, [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), + [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -10773,9 +10774,18 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_RECEIVE_MULTICAST] && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS)) { + GENL_SET_ERR_MSG(info, + "multicast RX registrations are not supported"); + return -EOPNOTSUPP; + } + return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]), + info->attrs[NL80211_ATTR_RECEIVE_MULTICAST], info->extack); } -- cgit v1.2.3 From 155d7c733807190258639c66b36340948f369349 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Apr 2020 14:06:00 +0200 Subject: nl80211: allow client-only BIGTK support The current NL80211_EXT_FEATURE_BEACON_PROTECTION feature flag requires both AP and client support; add a new one, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, that enables support only in client (and P2P-client) modes. Link: https://lore.kernel.org/r/20200420140559.6ba704053a5a.Ifeb869fb0b48e52fe0cb9c15572b93ac8a924f8d@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e0dc89eceab8..9679d561f7d0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5690,6 +5690,8 @@ enum nl80211_feature_flags { * * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection * and can receive key configuration for BIGTK using key indexes 6 and 7. + * @NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT: The driver supports Beacon + * protection as a client only and cannot transmit protected beacons. * * @NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH: The driver can disable the * forwarding of preauth frames over the control port.
They are then @@ -5755,6 +5757,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 73a3e885d4dd..d470d77d2eb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3905,14 +3905,25 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) }; void *hdr; struct sk_buff *msg; + bool bigtk_support = false; + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + bigtk_support = true; + + if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) + bigtk_support = true; if (info->attrs[NL80211_ATTR_KEY_IDX]) { key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 5 && - !wiphy_ext_feature_isset( - &rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + + if (key_idx >= 6 && key_idx <= 7 && !bigtk_support) { + GENL_SET_ERR_MSG(info, "BIGTK not supported"); return -EINVAL; + } } if (info->attrs[NL80211_ATTR_MAC]) -- cgit v1.2.3 From 873b1cf61105a67f01f6fc3758405edb1bd1ba35 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 21 Apr 2020 17:48:15 +0300 Subject: mac80211: Process multicast RX registration for Action frames Convert a user space registration for processing multicast Action frames (NL80211_CMD_REGISTER_FRAME with NL80211_ATTR_RECEIVE_MULTICAST) to a new enum ieee80211_filter_flags bit FIF_MCAST_ACTION so that drivers can update their RX filter parameters appropriately, if needed. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200421144815.19175-1-jouni@codeaurora.org [rename variables to rx_mcast_action_reg indicating action frames only] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/cfg.c | 14 +++++++++++--- net/mac80211/ieee80211_i.h | 1 + net/mac80211/main.c | 3 +++ 4 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f6dc5a38720f..f12fe3b0a868 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1620,6 +1620,8 @@ enum ieee80211_vif_flags { * monitor interface (if that is requested.) * @probe_req_reg: probe requests should be reported to mac80211 for this * interface. + * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211 + * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). 
* @txq: the multicast data TX queue (if driver uses the TXQ abstraction) @@ -1648,6 +1650,7 @@ struct ieee80211_vif { #endif bool probe_req_reg; + bool rx_mcast_action_reg; bool txqs_stopped[IEEE80211_NUM_ACS]; @@ -3091,6 +3094,8 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * @FIF_PSPOLL: pass PS Poll frames * * @FIF_PROBE_REQ: pass probe request frames + * + * @FIF_MCAST_ACTION: pass multicast Action frames */ enum ieee80211_filter_flags { FIF_ALLMULTI = 1<<1, @@ -3101,6 +3106,7 @@ enum ieee80211_filter_flags { FIF_OTHER_BSS = 1<<6, FIF_PSPOLL = 1<<7, FIF_PROBE_REQ = 1<<8, + FIF_MCAST_ACTION = 1<<9, }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e62b4764e82e..f0d43b9cfa43 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3406,15 +3406,23 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = - local->probe_req_reg != !!(upd->global_stypes & preq_mask); + (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || + (local->rx_mcast_action_reg != + !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; + local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; - intf_change = sdata->vif.probe_req_reg != - !!(upd->interface_stypes & preq_mask); + intf_change = (sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask)) || + (sdata->vif.rx_mcast_action_reg != + !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; + sdata->vif.rx_mcast_action_reg = + upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index da41ee996d3d..9407cf44305c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1168,6 +1168,7 @@ struct ieee80211_local { int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; bool probe_req_reg; + bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0e9ad60fb2b3..a0cb052ea30d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -64,6 +64,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local) if (local->fif_pspoll) new_flags |= FIF_PSPOLL; + if (local->rx_mcast_action_reg) + new_flags |= FIF_MCAST_ACTION; + spin_lock_bh(&local->filter_lock); changed_flags = local->filter_flags ^ new_flags; -- cgit v1.2.3 From 9166cc49767a646990a73380480356416b7794eb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:32 +0200 Subject: mac80211: implement Operating Mode Notification extended NSS support Somehow we missed this for a long time, but similar to the extended NSS support in VHT capabilities, we need to have this in Operating Mode notification. Implement it by
 * parsing the 160/80+80 bit there and setting the bandwidth appropriately
 * having callers of ieee80211_get_vht_max_nss() pass in the current max NSS value as received in the operating mode notification in order to modify it appropriately depending on the extended NSS bits.
This updates all drivers that use it, i.e. only iwlwifi/mvm.
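A stand-alone sketch of the extended decode (the mask values are copied from enum ieee80211_vht_opmode_bits in the diff; the helper itself is invented):

#include <stdio.h>

static void decode_opmode(unsigned opmode)
{
	unsigned bw = opmode & 0x03;		   /* ..._CHANWIDTH_MASK */
	unsigned bw160 = !!(opmode & 0x04);	   /* ..._BW_160_80P80 */
	unsigned nss = ((opmode & 0x70) >> 4) + 1; /* ..._RX_NSS_MASK/SHIFT */

	/* with the extended encoding, "80 MHz" plus the new bit means
	 * 160 or 80+80 MHz; width code 3 remains the legacy encoding */
	if (bw == 2 && bw160)
		bw = 3;

	printf("opmode 0x%02x -> width code %u, max rx nss %u\n",
	       opmode, bw, nss);
}

int main(void)
{
	decode_opmode(0x16); /* 80 MHz + ext bit + NSS 2 -> width code 3 */
	decode_opmode(0x01); /* 40 MHz, NSS 1 */
	return 0;
}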
Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.098483728cfa.I4e8c25d3288441759c2793247197229f0696a37d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 6 +++--- include/linux/ieee80211.h | 12 +++++++++--- net/mac80211/vht.c | 10 ++++++++-- net/wireless/util.c | 26 ++++++++++++++------------ 4 files changed, 34 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index c1aba2bf73cf..a8c13f6fbce0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -1,10 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * - * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. + * Copyright(c) 2005 - 2014, 2018 - 2020 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation * * Contact Information: * Intel Linux Wireless @@ -1430,7 +1429,8 @@ static u32 rs_bw_from_sta_bw(struct ieee80211_sta *sta) */ if (ieee80211_get_vht_max_nss(&vht_cap, IEEE80211_VHT_CHANWIDTH_160MHZ, - 0, true) < sta->rx_nss) + 0, true, + sta->rx_nss) < sta->rx_nss) return RATE_MCS_CHAN_WIDTH_80; return RATE_MCS_CHAN_WIDTH_160; case IEEE80211_STA_RX_BW_80: diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16268ef1cbcc..c326aec535c6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2019 Intel Corporation + * Copyright (c) 2018 - 2020 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -859,6 +859,7 @@ enum ieee80211_ht_chanwidth_values { * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask * (the NSS value is the value of this field + 1) * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift @@ -866,11 +867,12 @@ enum ieee80211_ht_chanwidth_values { * using a beamforming steering matrix */ enum ieee80211_vht_opmode_bits { - IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 3, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, + IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, @@ -1731,6 +1733,9 @@ struct ieee80211_mu_edca_param_set { * @ext_nss_bw_capable: indicates whether or not the local transmitter * (rate scaling algorithm) can deal with the new logic * (dot11VHTExtendedNSSBWCapable) + * @max_vht_nss: current maximum NSS as advertised by the STA in + * operating mode notification, can be 0 in which case the + * capability data will be used to derive this (from MCS support) * * Due to the VHT Extended NSS Bandwidth Support, the maximum 
NSS can * vary for a given BW/MCS. This function parses the data. @@ -1739,7 +1744,8 @@ struct ieee80211_mu_edca_param_set { */ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable); + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss); /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 632f07401850..9c6045f9c24d 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -575,15 +575,21 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + else + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: + /* legacy only, no longer used by newer spec */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 6590efbbcbb9..123d6ce79b8e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include #include @@ -2030,10 +2030,10 @@ EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable) + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); - int max_vht_nss = 0; int ext_nss_bw; int supp_width; int i, mcs_encoding; @@ -2041,7 +2041,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, if (map == 0xffff) return 0; - if (WARN_ON(mcs > 9)) + if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; @@ -2050,16 +2050,18 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, else mcs_encoding = 2; - /* find max_vht_nss for the given MCS */ - for (i = 7; i >= 0; i--) { - int supp = (map >> (2 * i)) & 3; + if (!max_vht_nss) { + /* find max_vht_nss for the given MCS */ + for (i = 7; i >= 0; i--) { + int supp = (map >> (2 * i)) & 3; - if (supp == 3) - continue; + if (supp == 3) + continue; - if (supp >= mcs_encoding) { - max_vht_nss = i + 1; - break; + if (supp >= mcs_encoding) { + max_vht_nss = i + 1; + break; + } } } -- cgit v1.2.3 From d46b4ab870fa29445b701e922e9aa36b15f833ea Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 26 Mar 2020 15:09:33 +0200 Subject: mac80211: add twt_protected flag to the bss_conf structure Add a flag to the BSS conf whether the BSS and STA support protected TWT. 
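The added condition reduces to a small predicate; a stand-alone sketch (BIT(4) here assumes the kernel's WLAN_RSNX_CAPA_PROTECTED_TWT value):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* the AP's RSNX element must advertise Protected TWT in its first octet
 * and the local driver must have set NL80211_EXT_FEATURE_PROTECTED_TWT */
static bool twt_protected(const uint8_t *rsnx, size_t rsnx_len,
			  bool driver_protected_twt)
{
	return rsnx && rsnx_len && (rsnx[0] & (1u << 4)) &&
	       driver_protected_twt;
}

int main(void)
{
	uint8_t rsnx[] = { 0x10 }; /* Protected TWT bit set */

	/* prints "1 0" */
	printf("%d %d\n",
	       twt_protected(rsnx, sizeof(rsnx), true),
	       twt_protected(rsnx, sizeof(rsnx), false));
	return 0;
}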
Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.1dcb2d16fa74.I74d7c007dad2601d2e39f54612fe6554dd5ab386@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/mlme.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f12fe3b0a868..5fb80dd8bbbc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -508,6 +508,7 @@ struct ieee80211_ftm_responder_params { * mode only, set if the AP advertises TWT responder role) * @twt_responder: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) + * @twt_protected: does this BSS support protected TWT frames * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -618,6 +619,7 @@ struct ieee80211_bss_conf { bool he_support; bool twt_requester; bool twt_responder; + bool twt_protected; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 7139335f29c0..b77787995723 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3384,10 +3384,19 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, sta); bss_conf->he_support = sta->sta.he_cap.has_he; + if (elems->rsnx && elems->rsnx_len && + (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && + wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_PROTECTED_TWT)) + bss_conf->twt_protected = true; + else + bss_conf->twt_protected = false; + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; + bss_conf->twt_protected = false; } if (bss_conf->he_support) { -- cgit v1.2.3 From a4055e74a2ff7c70ccdb6c36254ad5181464f211 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Thu, 26 Mar 2020 15:09:34 +0200 Subject: mac80211: Don't destroy auth data in case of anti-clogging An SAE AP may reject authentication with WLAN_STATUS_ANTI_CLOG_REQUIRED. As user space will immediately continue the authentication flow, there is no need to destroy the authentication data in this case. This saves an unneeded station removal and channel release.
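A stand-alone sketch of the retained-state decision (3 and 76 are the 802.11 constants behind WLAN_AUTH_SAE and WLAN_STATUS_ANTI_CLOG_REQUIRED):

#include <stdbool.h>
#include <stdio.h>

static bool keep_auth_data(unsigned auth_alg, unsigned status_code)
{
	/* user space will immediately retry with an anti-clogging token,
	 * so tearing down the station and channel state would be wasted */
	return auth_alg == 3 && status_code == 76;
}

int main(void)
{
	printf("SAE, anti-clog required: keep=%d\n", keep_auth_data(3, 76));
	printf("SAE, other failure: keep=%d\n", keep_auth_data(3, 1));
	return 0;
}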
Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.7483996157a8.I8040a842874aaf6d209df3fc8a2acb97a0bf508b@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b77787995723..56d61bc9954d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2948,10 +2948,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } if (status_code != WLAN_STATUS_SUCCESS) { + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + + if (auth_alg == WLAN_AUTH_SAE && + status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED) + return; + sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); -- cgit v1.2.3 From 2a392596d8811c6d58c014ec881b159c75a0cf45 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:35 +0200 Subject: cfg80211: Parse HE membership selector This extends support for drivers that rebuild IEs in the FW (same as with HT/VHT). Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.20feaabfb484.I886252639604c8e3e84b8ef97962f1b0e4beec81@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 3 ++- net/wireless/nl80211.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c326aec535c6..38f513ce7528 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1067,6 +1067,7 @@ struct ieee80211_mgmt { /* Supported rates membership selectors */ #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 +#define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index dbb9675fe38f..e288fdcb3df2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1054,6 +1054,7 @@ enum cfg80211_ap_settings_flags { * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time + * @he_required: stations must support HE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings @@ -1083,7 +1084,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; - bool ht_required, vht_required; + bool ht_required, vht_required, he_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d470d77d2eb6..3d27b24c68b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4738,6 +4738,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->ht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + params->he_required = true; } } -- cgit v1.2.3 From
4826e721103acf42421304330cf48a642fa163bb Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:36 +0200 Subject: mac80211: Skip entries with HE membership selector When parsing supported rates IE. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.ed3e66f8c197.I93aad0e5ddb7ce79f05f8153922acb9aa5076d38@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 56d61bc9954d..c77f47b41356 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3154,15 +3154,16 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * Skip HT and VHT BSS membership selectors since they're not - * rates. + * Skip HT, VHT and HE BSS membership selectors since they're + * not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY)) + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY)) continue; for (j = 0; j < sband->n_bitrates; j++) { -- cgit v1.2.3 From 31d8bb4e07f80935ee9bf599a9d99de7ca90fc5a Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Thu, 26 Mar 2020 15:09:37 +0200 Subject: mac80211: agg-tx: refactor sending addba We move the actual arming the timer and sending ADDBA to a function for the use in different places calling the same logic. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.58a337eb90a1.I75934e6464535fbf43969acc796bc886291e79a5@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 67 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 33da6f738c99..32f40c4f3120 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -448,6 +448,43 @@ static void sta_addba_resp_timer_expired(struct timer_list *t) ieee80211_stop_tx_ba_session(&sta->sta, tid); } +static void ieee80211_send_addba_with_timeout(struct sta_info *sta, + struct tid_ampdu_tx *tid_tx) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sta->local; + u8 tid = tid_tx->tid; + u16 buf_size; + + /* activate the timer for the recipient's addBA response */ + mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); + ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", + sta->sta.addr, tid); + + spin_lock_bh(&sta->lock); + sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; + sta->ampdu_mlme.addba_req_num[tid]++; + spin_unlock_bh(&sta->lock); + + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. 
+ */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + + /* send AddBA request */ + ieee80211_send_addba_request(sdata, sta->sta.addr, tid, + tid_tx->dialog_token, + sta->tid_seq[tid] >> 4, + buf_size, tid_tx->timeout); +} + void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; @@ -462,7 +499,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; - u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -508,32 +544,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) return; } - /* activate the timer for the recipient's addBA response */ - mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); - ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", - sta->sta.addr, tid); - - spin_lock_bh(&sta->lock); - sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; - sta->ampdu_mlme.addba_req_num[tid]++; - spin_unlock_bh(&sta->lock); - - if (sta->sta.he_cap.has_he) { - buf_size = local->hw.max_tx_aggregation_subframes; - } else { - /* - * We really should use what the driver told us it will - * transmit as the maximum, but certain APs (e.g. the - * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) - * will crash when we use a lower number. - */ - buf_size = IEEE80211_MAX_AMPDU_BUF_HT; - } - - /* send AddBA request */ - ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, params.ssn, - buf_size, tid_tx->timeout); + ieee80211_send_addba_with_timeout(sta, tid_tx); } /* -- cgit v1.2.3 From 0c197f16f7bc5ddb43073690a80fb15998ad61e4 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Thu, 26 Mar 2020 15:09:38 +0200 Subject: mac80211: agg-tx: add an option to defer ADDBA transmit Driver tells mac80211 to sends ADDBA with SSN (starting sequence number) from the head of the queue, while the transmission of all the frames in the queue may take a while, which causes the peer to time out. In order to fix this scenario, add an option to defer ADDBA transmit until queue is drained. Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.0f27423fec75.If67daab123a27c1cbddef000d6a3f212aa6309ef@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +++++- net/mac80211/agg-tx.c | 12 +++++++++++- net/mac80211/sta_info.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5fb80dd8bbbc..f3147633dda2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3125,7 +3125,10 @@ enum ieee80211_filter_flags { * @IEEE80211_AMPDU_RX_START: start RX aggregation * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either - * call ieee80211_start_tx_ba_cb_irqsafe() or return the special + * call ieee80211_start_tx_ba_cb_irqsafe() or + * call ieee80211_start_tx_ba_cb_irqsafe() with status + * %IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay addba after + * ieee80211_start_tx_ba_cb_irqsafe is called, or just return the special * status %IEEE80211_AMPDU_TX_START_IMMEDIATE. 
* @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting @@ -3151,6 +3154,7 @@ enum ieee80211_ampdu_mlme_action { }; #define IEEE80211_AMPDU_TX_START_IMMEDIATE 1 +#define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2 /** * struct ieee80211_ampdu_params - AMPDU action parameters diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 32f40c4f3120..c2d5f512526d 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -483,6 +483,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, tid_tx->dialog_token, sta->tid_seq[tid] >> 4, buf_size, tid_tx->timeout); + + WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); } void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) @@ -521,7 +523,9 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + if (ret == IEEE80211_AMPDU_TX_START_DELAY_ADDBA) { + return; + } else if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { /* * We didn't send the request yet, so don't need to check * here if we already got a response, just mark as driver @@ -765,6 +769,12 @@ void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) return; + if (!test_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)) { + ieee80211_send_addba_with_timeout(sta, tid_tx); + /* RESPONSE_RECEIVED state whould trigger the flow again */ + return; + } + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) ieee80211_agg_tx_operational(local, sta, tid); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 36f1abaab9ff..a5de3aa6ea42 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -3,6 +3,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH + * Copyright(c) 2020 Intel Corporation */ #ifndef STA_INFO_H @@ -116,6 +117,7 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_WANT_STOP 5 #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 +#define HT_AGG_STATE_SENT_ADDBA 8 DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { -- cgit v1.2.3 From 302ff8b7a2b01cfb7645f112bb259af1c146c57a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 26 Mar 2020 15:09:39 +0200 Subject: mac80211: Fail association when AP has no legacy rates The MLME logic had a workaround that allowed to continue an association with an AP even if the AP did not provide any basic rates in its supported rates in the association response, assuming that the first (non basic) legacy rate could be used as a basic rate. However, this did not consider the case where the AP (which is obviously buggy) did not provide any legacy rate. Fix this by failing the association, as this can result in an unexpected failure in the low level driver and FW, e.g., in rate scale logic etc. 
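A minimal stand-alone model of the check described above (the function and variable names here are invented for illustration; in mlme.c the bitmap and the minimum-rate index are computed while walking the AP's supported rates):

#include <stdio.h>

/* basic_rates: bitmap of basic legacy rates from the assoc response.
 * min_rate_index: index of the lowest legacy rate seen, or -1 if the
 * AP advertised no legacy rates at all. */
static int check_legacy_rates(unsigned int *basic_rates, int min_rate_index)
{
        if (min_rate_index < 0)
                return -1;      /* buggy AP, no legacy rates: fail association */
        if (!*basic_rates)      /* old workaround: promote the lowest rate */
                *basic_rates = 1U << min_rate_index;
        return 0;
}

int main(void)
{
        unsigned int rates = 0;

        printf("no legacy rates: %d\n", check_legacy_rates(&rates, -1));
        printf("workaround: %d (bitmap 0x%x)\n",
               check_legacy_rates(&rates, 2), rates);
        return 0;
}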
Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.d70a1450d83f.I6e6ce5efda351a8544c0e7bfeee260fe3360d401@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c77f47b41356..59a35c7997c3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5036,8 +5036,16 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. + * Allow this workaround only in case the AP provided at least + * one rate. */ - if (!basic_rates && min_rate_index >= 0) { + if (min_rate_index < 0) { + sdata_info(sdata, + "No legacy rates in association response\n"); + + sta_info_free(local, new_sta); + return -EINVAL; + } else if (!basic_rates) { sdata_info(sdata, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); -- cgit v1.2.3 From dba25b04c61170cf8592f87df2bb086201047473 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Mar 2020 15:09:40 +0200 Subject: mac80211: minstrel_ht_assign_best_tp_rates: remove redundant test We know this pointer isn't NULL and in fact dereferenced it before, remove the redundant test. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200326150855.adf551928846.Iae9015573d6c350cc1b12a311d6d13d086beec6c@changeid Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 694a31978a04..5547111d22bf 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau + * Copyright (C) 2019-2020 Intel Corporation */ #include #include @@ -490,7 +491,7 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); - if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { + if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], tmp_mcs_tp_rate); -- cgit v1.2.3 From 934f4c7dd3a544bb8000f7436f1f0e12e04ebc37 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:03 -0700 Subject: cfg80211: express channels with a KHz component Some bands (S1G) define channels centered on a non-integer MHz. Give ieee80211_channel and cfg80211_chan_def a freq_offset component where the final frequency can be expressed as: MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; Also provide some helper functions to do the frequency conversion and test for equality. Retain the existing interface to frequency and channel conversion helpers, and expose new ones which handle frequencies in units of KHz. Some internal functions (net/wireless/chan.c) pass around a frequency value. Convert these to units of KHz. mesh, ibss, wext, etc. are currently ignored. 
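The arithmetic behind the new helpers is simple enough to show as a stand-alone program. The macros mirror the kernel's MHZ_TO_KHZ/KHZ_TO_MHZ; the trimmed-down struct and the 902.5 MHz S1G-style channel are purely illustrative:

#include <stdio.h>

#define MHZ_TO_KHZ(freq) ((freq) * 1000)
#define KHZ_TO_MHZ(freq) ((freq) / 1000)

struct channel {
        unsigned int center_freq;       /* MHz */
        unsigned int freq_offset;       /* KHz, always < 1000 */
};

/* Final frequency in KHz, as ieee80211_channel_to_khz() computes it */
static unsigned int channel_to_khz(const struct channel *c)
{
        return MHZ_TO_KHZ(c->center_freq) + c->freq_offset;
}

int main(void)
{
        struct channel s1g = { .center_freq = 902, .freq_offset = 500 };
        unsigned int khz = channel_to_khz(&s1g);

        printf("%u.%03u MHz = %u KHz\n",
               s1g.center_freq, s1g.freq_offset, khz);    /* 902.500 MHz = 902500 KHz */
        printf("whole-MHz part: %u\n", KHZ_TO_MHZ(khz));  /* 902 */
        return 0;
}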
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 92 ++++++++++++++++++++++++++++++++++++++++++++++---- net/wireless/chan.c | 68 +++++++++++++++++++++---------------- net/wireless/reg.c | 40 +++++++++++----------- net/wireless/scan.c | 4 +-- net/wireless/trace.h | 21 +++++++++--- net/wireless/util.c | 32 +++++++++++------- 6 files changed, 182 insertions(+), 75 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e288fdcb3df2..a82fc59a1d82 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -128,6 +128,7 @@ enum ieee80211_channel_flags { * with cfg80211. * * @center_freq: center frequency in MHz + * @freq_offset: offset from @center_freq, in KHz * @hw_value: hardware-specific value for the channel * @flags: channel flags from &enum ieee80211_channel_flags. * @orig_flags: channel flags at registration time, used by regulatory @@ -149,6 +150,7 @@ enum ieee80211_channel_flags { struct ieee80211_channel { enum nl80211_band band; u32 center_freq; + u16 freq_offset; u16 hw_value; u32 flags; int max_antenna_gain; @@ -617,6 +619,7 @@ struct key_params { * If edmg is requested (i.e. the .channels member is non-zero), * chan will define the primary channel and all other * parameters are ignored. + * @freq1_offset: offset from @center_freq1, in KHz */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -624,6 +627,7 @@ struct cfg80211_chan_def { u32 center_freq1; u32 center_freq2; struct ieee80211_edmg edmg; + u16 freq1_offset; }; /** @@ -713,6 +717,7 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, return (chandef1->chan == chandef2->chan && chandef1->width == chandef2->width && chandef1->center_freq1 == chandef2->center_freq1 && + chandef1->freq1_offset == chandef2->freq1_offset && chandef1->center_freq2 == chandef2->center_freq2); } @@ -5177,30 +5182,92 @@ static inline void *wdev_priv(struct wireless_dev *wdev) * cfg80211 offers a number of utility functions that can be useful. */ +/** + * ieee80211_channel_equal - compare two struct ieee80211_channel + * + * @a: 1st struct ieee80211_channel + * @b: 2nd struct ieee80211_channel + * Return: true if center frequency of @a == @b + */ +static inline bool +ieee80211_channel_equal(struct ieee80211_channel *a, + struct ieee80211_channel *b) +{ + return (a->center_freq == b->center_freq && + a->freq_offset == b->freq_offset); +} + +/** + * ieee80211_channel_to_khz - convert ieee80211_channel to frequency in KHz + * @chan: struct ieee80211_channel to convert + * Return: The corresponding frequency (in KHz) + */ +static inline u32 +ieee80211_channel_to_khz(const struct ieee80211_channel *chan) +{ + return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; +} + +/** + * ieee80211_channel_to_freq_khz - convert channel number to frequency + * @chan: channel number + * @band: band, necessary due to channel number overlap + * Return: The corresponding frequency (in KHz), or 0 if the conversion failed. + */ +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band); + /** * ieee80211_channel_to_frequency - convert channel number to frequency * @chan: channel number * @band: band, necessary due to channel number overlap * Return: The corresponding frequency (in MHz), or 0 if the conversion failed. 
*/ -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band); +static inline int +ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +{ + return KHZ_TO_MHZ(ieee80211_channel_to_freq_khz(chan, band)); +} + +/** + * ieee80211_freq_khz_to_channel - convert frequency to channel number + * @freq: center frequency in KHz + * Return: The corresponding channel, or 0 if the conversion failed. + */ +int ieee80211_freq_khz_to_channel(u32 freq); /** * ieee80211_frequency_to_channel - convert frequency to channel number - * @freq: center frequency + * @freq: center frequency in MHz * Return: The corresponding channel, or 0 if the conversion failed. */ -int ieee80211_frequency_to_channel(int freq); +static inline int +ieee80211_frequency_to_channel(int freq) +{ + return ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(freq)); +} + +/** + * ieee80211_get_channel_khz - get channel struct from wiphy for specified + * frequency + * @wiphy: the struct wiphy to get the channel for + * @freq: the center frequency (in KHz) of the channel + * Return: The channel struct from @wiphy at @freq. + */ +struct ieee80211_channel * +ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq); /** * ieee80211_get_channel - get channel struct from wiphy for specified frequency * * @wiphy: the struct wiphy to get the channel for - * @freq: the center frequency of the channel - * + * @freq: the center frequency (in MHz) of the channel * Return: The channel struct from @wiphy at @freq. */ -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq); +static inline struct ieee80211_channel * +ieee80211_get_channel(struct wiphy *wiphy, int freq) +{ + return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq)); +} /** * ieee80211_get_response_rate - get basic rate for a given rate @@ -7228,6 +7295,19 @@ bool ieee80211_operating_class_to_band(u8 operating_class, bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class); +/** + * ieee80211_chandef_to_khz - convert chandef to frequency in KHz + * + * @chandef: the chandef to convert + * + * Returns the center frequency of chandef (1st segment) in KHz. 
+ */ +static inline u32 +ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) +{ + return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; +} + /* * cfg80211_tdls_oper_request - request userspace to perform TDLS operation * @dev: the device on which the operation is requested diff --git a/net/wireless/chan.c b/net/wireless/chan.c index fcac5c6366e1..d60e50a3b910 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -27,6 +27,7 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, return; chandef->chan = chan; + chandef->freq1_offset = chan->freq_offset; chandef->center_freq2 = 0; chandef->edmg.bw_config = 0; chandef->edmg.channels = 0; @@ -153,7 +154,8 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: - if (chandef->center_freq1 != control_freq) + if (ieee80211_chandef_to_khz(chandef) != + ieee80211_channel_to_khz(chandef->chan)) return false; if (chandef->center_freq2) return false; @@ -386,10 +388,11 @@ static u32 cfg80211_get_start_freq(u32 center_freq, { u32 start_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) start_freq = center_freq; else - start_freq = center_freq - bandwidth/2 + 10; + start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10); return start_freq; } @@ -399,10 +402,11 @@ static u32 cfg80211_get_end_freq(u32 center_freq, { u32 end_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) end_freq = center_freq; else - end_freq = center_freq + bandwidth/2 - 10; + end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10); return end_freq; } @@ -417,8 +421,8 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -449,8 +453,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return -EINVAL; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq1, - width); + ieee80211_chandef_to_khz(chandef), + width); if (ret < 0) return ret; else if (ret > 0) @@ -460,8 +464,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return 0; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (ret < 0) return ret; else if (ret > 0) @@ -503,8 +507,8 @@ static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy, * DFS_AVAILABLE). Return number of usable channels * (require CAC). Allow DFS and non-DFS channel mix. 
*/ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -536,8 +540,9 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, if (width < 0) return false; - r1 = cfg80211_get_chans_dfs_usable(wiphy, chandef->center_freq1, - width); + r1 = cfg80211_get_chans_dfs_usable(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), + width); if (r1 < 0) return false; @@ -546,8 +551,8 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r2 = cfg80211_get_chans_dfs_usable(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (r2 < 0) return false; break; @@ -694,8 +699,8 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, * If any channel in between is disabled or has not * had gone through CAC return false */ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return false; @@ -724,7 +729,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, if (width < 0) return false; - r = cfg80211_get_chans_dfs_available(wiphy, chandef->center_freq1, + r = cfg80211_get_chans_dfs_available(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), width); /* If any of channels unavailable for cf1 just return */ @@ -735,8 +741,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r = cfg80211_get_chans_dfs_available(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); break; default: WARN_ON(chandef->center_freq2); @@ -757,8 +763,8 @@ static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return 0; @@ -790,14 +796,14 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, return 0; t1 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq1, + MHZ_TO_KHZ(chandef->center_freq1), width); if (!chandef->center_freq2) return t1; t2 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq2, + MHZ_TO_KHZ(chandef->center_freq2), width); return max(t1, t2); @@ -813,8 +819,8 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c || c->flags & prohibited_flags) return false; } @@ -976,13 +982,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, prohibited_flags |= IEEE80211_CHAN_NO_OFDM; - if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1, + if (!cfg80211_secondary_chans_ok(wiphy, + ieee80211_chandef_to_khz(chandef), width, prohibited_flags)) return false; if (!chandef->center_freq2) return true; - return 
cfg80211_secondary_chans_ok(wiphy, chandef->center_freq2, + return cfg80211_secondary_chans_ok(wiphy, + MHZ_TO_KHZ(chandef->center_freq2), width, prohibited_flags); } EXPORT_SYMBOL(cfg80211_chandef_usable); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d476d4da0d09..0d74a31ef0ab 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1658,22 +1658,23 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd const struct ieee80211_channel *chan) { const struct ieee80211_freq_range *freq_range = NULL; - u32 max_bandwidth_khz, bw_flags = 0; + u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; freq_range = ®_rule->freq_range; max_bandwidth_khz = freq_range->max_bandwidth_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); /* Check if auto calculation requested */ if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; @@ -1710,7 +1711,7 @@ static void handle_channel(struct wiphy *wiphy, flags = chan->orig_flags; - reg_rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq)); + reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan)); if (IS_ERR(reg_rule)) { /* * We will disable all channels that do not match our @@ -1729,13 +1730,13 @@ static void handle_channel(struct wiphy *wiphy, if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - pr_debug("Disabling freq %d MHz for good\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz for good\n", + chan->center_freq, chan->freq_offset); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { - pr_debug("Disabling freq %d MHz\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz\n", + chan->center_freq, chan->freq_offset); chan->flags |= IEEE80211_CHAN_DISABLED; } return; @@ -1936,7 +1937,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; - if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + if (likely(!ieee80211_channel_equal(chan, ®_beacon->chan))) return; if (chan->beacon_found) @@ -2269,18 +2270,18 @@ static void handle_channel_custom(struct wiphy *wiphy, u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; - u32 bw; + u32 bw, center_freq_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { - reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), - regd, bw); + reg_rule = freq_reg_info_regd(center_freq_khz, regd, bw); if (!IS_ERR(reg_rule)) break; } if (IS_ERR_OR_NULL(reg_rule)) { - pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n", + chan->center_freq, chan->freq_offset); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { @@ -3337,8 +3338,8 @@ static bool pending_reg_beacon(struct 
ieee80211_channel *beacon_chan) struct reg_beacon *pending_beacon; list_for_each_entry(pending_beacon, ®_pending_beacons, list) - if (beacon_chan->center_freq == - pending_beacon->chan.center_freq) + if (ieee80211_channel_equal(beacon_chan, + &pending_beacon->chan)) return true; return false; } @@ -3367,9 +3368,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (!reg_beacon) return -ENOMEM; - pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", - beacon_chan->center_freq, - ieee80211_frequency_to_channel(beacon_chan->center_freq), + pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, beacon_chan->freq_offset, + ieee80211_freq_khz_to_channel( + ieee80211_channel_to_khz(beacon_chan)), wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4000382aef48..74ea4cfb39fb 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1322,8 +1322,8 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, return channel; } - freq = ieee80211_channel_to_frequency(channel_number, channel->band); - alt_channel = ieee80211_get_channel(wiphy, freq); + freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); + alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ) { /* diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ee736620f1e3..53c887ea67c7 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -112,24 +112,29 @@ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ - __field(u32, center_freq) + __field(u32, center_freq) \ + __field(u16, freq_offset) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ __entry->band = chan->band; \ __entry->center_freq = chan->center_freq; \ + __entry->freq_offset = chan->freq_offset; \ } else { \ __entry->band = 0; \ __entry->center_freq = 0; \ + __entry->freq_offset = 0; \ } \ } while (0) -#define CHAN_PR_FMT "band: %d, freq: %u" -#define CHAN_PR_ARG __entry->band, __entry->center_freq +#define CHAN_PR_FMT "band: %d, freq: %u.%03u" +#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset #define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHAN_DEF_ASSIGN(chandef) \ do { \ @@ -137,21 +142,27 @@ __entry->band = (chandef)->chan->band; \ __entry->control_freq = \ (chandef)->chan->center_freq; \ + __entry->freq_offset = \ + (chandef)->chan->freq_offset; \ __entry->width = (chandef)->width; \ __entry->center_freq1 = (chandef)->center_freq1;\ + __entry->freq1_offset = (chandef)->freq1_offset;\ __entry->center_freq2 = (chandef)->center_freq2;\ } else { \ __entry->band = 0; \ __entry->control_freq = 0; \ + __entry->freq_offset = 0; \ __entry->width = 0; \ __entry->center_freq1 = 0; \ + __entry->freq1_offset = 0; \ __entry->center_freq2 = 0; \ } \ } while (0) #define CHAN_DEF_PR_FMT \ - "band: %d, control freq: %u, width: %d, cf1: %u, cf2: %u" + "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u" #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \ - __entry->width, __entry->center_freq1, \ + __entry->freq_offset, __entry->width, \ + __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 #define SINFO_ENTRY __field(int, generation) \ diff --git a/net/wireless/util.c 
b/net/wireless/util.c index 123d6ce79b8e..df75e58eca5d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -72,7 +72,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_mandatory_rates); -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ @@ -81,15 +81,15 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) switch (band) { case NL80211_BAND_2GHZ: if (chan == 14) - return 2484; + return MHZ_TO_KHZ(2484); else if (chan < 14) - return 2407 + chan * 5; + return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) - return 4000 + chan * 5; + return MHZ_TO_KHZ(4000 + chan * 5); else - return 5000 + chan * 5; + return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D4.1 27.3.22.2 */ @@ -98,17 +98,20 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) break; case NL80211_BAND_60GHZ: if (chan < 7) - return 56160 + chan * 2160; + return MHZ_TO_KHZ(56160 + chan * 2160); break; default: ; } return 0; /* not supported */ } -EXPORT_SYMBOL(ieee80211_channel_to_frequency); +EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -int ieee80211_frequency_to_channel(int freq) +int ieee80211_freq_khz_to_channel(u32 freq) { + /* TODO: just handle MHz for now */ + freq = KHZ_TO_MHZ(freq); + /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; @@ -126,9 +129,10 @@ int ieee80211_frequency_to_channel(int freq) else return 0; } -EXPORT_SYMBOL(ieee80211_frequency_to_channel); +EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) +struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, + u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; @@ -141,14 +145,16 @@ struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) continue; for (i = 0; i < sband->n_channels; i++) { - if (sband->channels[i].center_freq == freq) - return &sband->channels[i]; + struct ieee80211_channel *chan = &sband->channels[i]; + + if (ieee80211_channel_to_khz(chan) == freq) + return chan; } } return NULL; } -EXPORT_SYMBOL(ieee80211_get_channel); +EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { -- cgit v1.2.3 From b6011960f392d1de619f10aa5d088c27f1e7526c Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:04 -0700 Subject: mac80211: handle channel frequency offset cfg80211_chan_def and ieee80211_channel recently gained a frequency offset component. Handle this where it makes sense (potentially required by S1G channels). For IBSS, TDLS, CSA, and ROC we return -EOPNOTSUPP if a channel with frequency offset is passed, since they may or may not work. Once someone tests and verifies these commands work on those types of channels, we can remove that error. join_ocb and join_mesh look harmless because they use a simple ieee80211_vif_use_channel(), which is using an already verified channel, so we let those through.
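The guard applied below to IBSS, TDLS, CSA and ROC reduces to a few lines. This sketch uses simplified stand-in types rather than the real mac80211 structures; only the -EOPNOTSUPP policy is taken from the patch:

#include <errno.h>
#include <stdio.h>

struct channel {
        unsigned int center_freq;       /* MHz */
        unsigned int freq_offset;       /* KHz */
};

/* Refuse code paths that have not been verified with KHz-offset channels */
static int start_remain_on_channel(const struct channel *chan)
{
        if (chan->freq_offset)
                return -EOPNOTSUPP;     /* this may work, but is untested */
        /* ... normal remain-on-channel setup would continue here ... */
        return 0;
}

int main(void)
{
        struct channel plain = { 2412, 0 };
        struct channel s1g = { 902, 500 };

        printf("plain: %d, with offset: %d\n",
               start_remain_on_channel(&plain),
               start_remain_on_channel(&s1g));
        return 0;
}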
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-4-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 6 ++++++ net/mac80211/chan.c | 1 + net/mac80211/ibss.c | 5 +++++ net/mac80211/main.c | 8 +++++--- net/mac80211/mlme.c | 16 ++++++++++++---- net/mac80211/offchannel.c | 4 ++++ net/mac80211/scan.c | 1 + net/mac80211/tdls.c | 4 ++++ net/mac80211/trace.h | 41 +++++++++++++++++++++++++++++++++-------- 9 files changed, 71 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f0d43b9cfa43..ae3e06375a28 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3287,6 +3287,12 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + err = -EOPNOTSUPP; + goto out; + } + chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 9c94baaf693c..e6e192f53e4e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -533,6 +533,7 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local, struct cfg80211_chan_def *chandef = &local->_oper_chandef; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = chandef->chan->center_freq; + chandef->freq1_offset = chandef->chan->freq_offset; chandef->center_freq2 = 0; /* NOTE: Disabling radar is only valid here for diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index d40744903fa9..2479cd48fed0 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1758,6 +1758,11 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, int i; int ret; + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + return -EOPNOTSUPP; + } + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, ¶ms->chandef, sdata->wdev.iftype); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a0cb052ea30d..dfcee5e462da 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -107,13 +107,15 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) chandef.chan = local->tmp_channel; chandef.width = NL80211_CHAN_WIDTH_20_NOHT; chandef.center_freq1 = chandef.chan->center_freq; + chandef.freq1_offset = chandef.chan->freq_offset; } else chandef = local->_oper_chandef; WARN(!cfg80211_chandef_valid(&chandef), - "control:%d MHz width:%d center: %d/%d MHz", - chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz", + chandef.chan->center_freq, chandef.chan->freq_offset, + chandef.width, chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef)) local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 59a35c7997c3..acc8adf50d69 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -162,6 +162,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; + chandef->freq1_offset = channel->freq_offset; if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | @@ -396,9 +397,12 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return 0; sdata_info(sdata, - "AP %pM changed bandwidth, new config is %d MHz, width %d (%d/%d 
MHz)\n", - ifmgd->bssid, chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "AP %pM changed bandwidth, new config is %d.%03d MHz, " + "width %d (%d.%03d/%d MHz)\n", + ifmgd->bssid, chandef.chan->center_freq, + chandef.chan->freq_offset, chandef.width, + chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -1364,10 +1368,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, - "AP %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", + "AP %pM switches to unsupported channel " + "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), " + "disconnecting\n", ifmgd->associated->bssid, csa_ie.chandef.chan->center_freq, + csa_ie.chandef.chan->freq_offset, csa_ie.chandef.width, csa_ie.chandef.center_freq1, + csa_ie.chandef.freq1_offset, csa_ie.chandef.center_freq2); ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index c710504ccf1a..db3b8bf75656 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -557,6 +557,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, lockdep_assert_held(&local->mtx); + if (channel->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + if (local->use_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fdac8192a519..4d14118dddca 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -896,6 +896,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; + local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; switch (scan_req->scan_width) { case NL80211_BSS_CHAN_WIDTH_5: diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 7ff22f9d6e80..8ad420db3766 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1566,6 +1566,10 @@ ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, u32 ch_sw_tm_ie; int ret; + if (chandef->chan->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, addr); if (!sta) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 427f51a0a994..1b4709694d2a 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -37,32 +37,42 @@ #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" #define CHANDEF_ENTRY __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, chan_width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHANDEF_ASSIGN(c) \ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ + __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0; \ __entry->chan_width = (c) ? (c)->width : 0; \ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ + __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? 
(c)->center_freq2 : 0; -#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz" -#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \ - __entry->center_freq1, __entry->center_freq2 +#define CHANDEF_PR_FMT " control:%d.%03d MHz width:%d center: %d.%03d/%d MHz" +#define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ + __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 #define MIN_CHANDEF_ENTRY \ __field(u32, min_control_freq) \ + __field(u32, min_freq_offset) \ __field(u32, min_chan_width) \ __field(u32, min_center_freq1) \ + __field(u32, min_freq1_offset) \ __field(u32, min_center_freq2) #define MIN_CHANDEF_ASSIGN(c) \ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ + __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ + __entry->freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; -#define MIN_CHANDEF_PR_FMT " min_control:%d MHz min_width:%d min_center: %d/%d MHz" -#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_chan_width, \ - __entry->min_center_freq1, __entry->min_center_freq2 +#define MIN_CHANDEF_PR_FMT " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz" +#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ + __entry->min_chan_width, \ + __entry->min_center_freq1, __entry->min_freq1_offset, \ + __entry->min_center_freq2 #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ @@ -412,6 +422,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(s32, cqm_rssi_hyst) __field(u32, channel_width) __field(u32, channel_cfreq1) + __field(u32, channel_cfreq1_offset) __dynamic_array(u32, arp_addr_list, info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : @@ -452,6 +463,7 @@ TRACE_EVENT(drv_bss_info_changed, __entry->cqm_rssi_hyst = info->cqm_rssi_hyst; __entry->channel_width = info->chandef.width; __entry->channel_cfreq1 = info->chandef.center_freq1; + __entry->channel_cfreq1_offset = info->chandef.freq1_offset; __entry->arp_addr_cnt = info->arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list, sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? 
@@ -1223,6 +1235,7 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ENTRY VIF_ENTRY __field(int, center_freq) + __field(int, freq_offset) __field(unsigned int, duration) __field(u32, type) ), @@ -1231,14 +1244,16 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ASSIGN; VIF_ASSIGN; __entry->center_freq = chan->center_freq; + __entry->freq_offset = chan->freq_offset; __entry->duration = duration; __entry->type = type; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms type=%d", + LOCAL_PR_FMT VIF_PR_FMT " freq:%d.%03dMHz duration:%dms type=%d", LOCAL_PR_ARG, VIF_PR_ARG, - __entry->center_freq, __entry->duration, __entry->type + __entry->center_freq, __entry->freq_offset, + __entry->duration, __entry->type ) ); @@ -1546,8 +1561,10 @@ struct trace_vif_entry { struct trace_chandef_entry { u32 control_freq; + u32 freq_offset; u32 chan_width; u32 center_freq1; + u32 freq1_offset; u32 center_freq2; } __packed; @@ -1597,18 +1614,26 @@ TRACE_EVENT(drv_switch_vif_chanctx, sizeof(local_vifs[i].vif.vif_name)); SWITCH_ENTRY_ASSIGN(old_chandef.control_freq, old_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset, + old_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(old_chandef.chan_width, old_ctx->def.width); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1, old_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset, + old_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2, old_ctx->def.center_freq2); SWITCH_ENTRY_ASSIGN(new_chandef.control_freq, new_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset, + new_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(new_chandef.chan_width, new_ctx->def.width); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1, new_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset, + new_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2, new_ctx->def.center_freq2); } -- cgit v1.2.3 From 3b23c184f72acddad39c40373f165e1a9e384758 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Wed, 1 Apr 2020 18:18:05 -0700 Subject: mac80211: add freq_offset to RX status RX status needs a KHz component, so add freq_offset. We can reduce the bits for the frequency since 60 GHz isn't supported. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200402011810.22947-5-thomas@adapt-ip.com [fix commit message] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 +++++++++- net/mac80211/mlme.c | 6 ++++-- net/mac80211/rx.c | 1 + net/mac80211/scan.c | 3 ++- 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f3147633dda2..2936049f918e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1335,6 +1335,7 @@ enum mac80211_rx_encoding { * @freq: frequency the radio was tuned to when receiving this frame, in MHz * This field must be set for management frames, but isn't strictly needed * for data (other) frames - for those it only affects radiotap reporting. + * @freq_offset: @freq has a positive offset of 500Khz. 
* @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* @@ -1365,7 +1366,7 @@ struct ieee80211_rx_status { u32 device_timestamp; u32 ampdu_reference; u32 flag; - u16 freq; + u16 freq: 13, freq_offset: 1; u8 enc_flags; u8 encoding:2, bw:3, he_ru:3; u8 he_gi:2, he_dcm:1; @@ -1381,6 +1382,13 @@ struct ieee80211_rx_status { u8 zero_length_psdu_type; }; +static inline u32 +ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status) +{ + return MHZ_TO_KHZ(rx_status->freq) + + (rx_status->freq_offset ? 500 : 0); +} + /** * struct ieee80211_vendor_radiotap - vendor radiotap data information * @present: presence bitmap for this vendor namespace diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index acc8adf50d69..a259b4487b60 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3683,7 +3683,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, sdata_assert_lock(sdata); - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; @@ -3899,7 +3900,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, return; } - if (rx_status->freq != chanctx_conf->def.chan->center_freq) { + if (ieee80211_rx_status_to_khz(rx_status) != + ieee80211_channel_to_khz(chanctx_conf->def.chan)) { rcu_read_unlock(); return; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a724551b8ddf..eaf8931e4627 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -412,6 +412,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos++; /* IEEE80211_RADIOTAP_CHANNEL */ + /* TODO: frequency offset in KHz */ put_unaligned_le16(status->freq, pos); pos += 2; if (status->bw == RATE_INFO_BW_10) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4d14118dddca..5db15996524f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -275,7 +275,8 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; } - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; -- cgit v1.2.3 From be689f68d040702a3521035d267949d3927971f0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2020 12:01:04 +0200 Subject: cfg80211: reject channels/chandefs with KHz offset >= 1000 This should be covered by the next MHz, make sure that the numbers are always normalized. 
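The normalization rule is easy to state as a stand-alone check (types simplified for illustration): an offset of 1000 KHz or more belongs in the MHz component, so such a value is rejected instead of being accepted in a non-normalized form:

#include <stdbool.h>
#include <stdio.h>

struct chandef {
        unsigned int center_freq1;      /* MHz */
        unsigned int freq1_offset;      /* KHz */
};

static bool offset_normalized(const struct chandef *c)
{
        return c->freq1_offset < 1000;
}

int main(void)
{
        struct chandef ok = { 902, 500 };
        struct chandef bad = { 902, 1500 }; /* should be 903 MHz + 500 KHz */

        printf("500 KHz offset: %s\n",
               offset_normalized(&ok) ? "valid" : "invalid");
        printf("1500 KHz offset: %s\n",
               offset_normalized(&bad) ? "valid" : "invalid");
        return 0;
}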
Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200424120103.12b91ecf75f9.I4bf499d58404283bbfacb517d614a816763bccf2@changeid Signed-off-by: Johannes Berg --- net/wireless/chan.c | 3 +++ net/wireless/core.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index d60e50a3b910..e111c08daa0e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -147,6 +147,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) if (!chandef->chan) return false; + if (chandef->freq1_offset >= 1000) + return false; + control_freq = chandef->chan->center_freq; switch (chandef->width) { diff --git a/net/wireless/core.c b/net/wireless/core.c index 5757dea2aa94..b795f363d004 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -834,6 +834,9 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; + + if (WARN_ON(sband->channels[i].freq_offset >= 1000)) + return -EINVAL; } for (i = 0; i < sband->n_iftype_data; i++) { -- cgit v1.2.3 From b6b5c42e3bab939d357d800fd313e3c995164065 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Apr 2020 12:39:46 +0200 Subject: mac80211: fix two missing documentation entries Add documentation for two struct entries that were missing. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200424123945.6b23a26ab5e7.I664440ab5f33442df8103253bf5b9fe84be8d58c@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/sta_info.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2936049f918e..ecb219e3ec4f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -820,6 +820,8 @@ enum mac80211_tx_info_flags { * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup + * @IEEE80211_TX_CTRL_HW_80211_ENCAP: This frame uses hardware encapsulation + * (header conversion) * * These flags are used in tx_info->control.flags. */ diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index a5de3aa6ea42..49728047dfad 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -69,6 +69,8 @@ * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered + * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, + * so drop all packets without a key later. * * @NUM_WLAN_STA_FLAGS: number of defined flags */ -- cgit v1.2.3 From e73f94d1b6f05f6f22434c63de255a9dec6fd23d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:14:37 +0800 Subject: batman-adv: remove unused inline function batadv_arp_change_timeout There are no callers in-tree.
Signed-off-by: YueHaibing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/distributed-arp-table.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2bff2f4a325c..4e031661682a 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -163,11 +163,6 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, { } -static inline void batadv_arp_change_timeout(struct net_device *soft_iface, - const char *name) -{ -} - static inline int batadv_dat_init(struct batadv_priv *bat_priv) { return 0; -- cgit v1.2.3 From b70ba69ef1f72e58c8e43b5689a37a66a7b31d11 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 23 Apr 2020 16:57:45 +0200 Subject: net: sched: report ndo_setup_tc failures via extack Help end-users of the 'tc' command to see if the drivers ndo_setup_tc function call fails. Troubleshooting when this happens is non-trivial (see full process here[1]), and results in net_device getting assigned the 'qdisc noop', which will drop all TX packets on the interface. [1]: https://github.com/xdp-project/xdp-project/blob/master/areas/arm64/board_nxp_ls1088/nxp-board04-troubleshoot-qdisc.org Signed-off-by: Jesper Dangaard Brouer Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- net/sched/cls_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 55bd1429678f..11b683c45c28 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -735,8 +735,11 @@ static int tcf_block_offload_cmd(struct tcf_block *block, INIT_LIST_HEAD(&bo.cb_list); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - if (err < 0) + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; + } return tcf_block_setup(block, &bo); } -- cgit v1.2.3 From 5c05c1dbb177293636a3f5ea4caa872dfcf50ccd Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 23 Apr 2020 17:02:56 +0100 Subject: net: phylink, dsa: eliminate phylink_fixed_state_cb() Move the callback into the phylink_config structure, rather than providing a callback to set this up. Signed-off-by: Russell King Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 46 +++++++++++++++------------------------------- include/linux/phylink.h | 6 +++--- net/dsa/slave.c | 20 +++++++++++--------- 3 files changed, 29 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 34ca12aec61b..0f23bec431c1 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -480,8 +480,8 @@ static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_state *state) { *state = pl->link_config; - if (pl->get_fixed_state) - pl->get_fixed_state(pl->netdev, state); + if (pl->config->get_fixed_state) + pl->config->get_fixed_state(pl->config, state); else if (pl->link_gpio) state->link = !!gpiod_get_value_cansleep(pl->link_gpio); @@ -1044,32 +1044,6 @@ void phylink_disconnect_phy(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_disconnect_phy); -/** - * phylink_fixed_state_cb() - allow setting a fixed link callback - * @pl: a pointer to a &struct phylink returned from phylink_create() - * @cb: callback to execute to determine the fixed link state. 
- * - * The MAC driver should call this driver when the state of its link - * can be determined through e.g: an out of band MMIO register. - */ -int phylink_fixed_state_cb(struct phylink *pl, - void (*cb)(struct net_device *dev, - struct phylink_link_state *state)) -{ - /* It does not make sense to let the link be overriden unless we use - * MLO_AN_FIXED - */ - if (pl->cfg_link_an_mode != MLO_AN_FIXED) - return -EINVAL; - - mutex_lock(&pl->state_mutex); - pl->get_fixed_state = cb; - mutex_unlock(&pl->state_mutex); - - return 0; -} -EXPORT_SYMBOL_GPL(phylink_fixed_state_cb); - /** * phylink_mac_change() - notify phylink of a change in MAC state * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -1106,6 +1080,8 @@ static irqreturn_t phylink_link_handler(int irq, void *data) */ void phylink_start(struct phylink *pl) { + bool poll = false; + ASSERT_RTNL(); phylink_info(pl, "configuring for %s/%s link mode\n", @@ -1142,10 +1118,18 @@ void phylink_start(struct phylink *pl) irq = 0; } if (irq <= 0) - mod_timer(&pl->link_poll, jiffies + HZ); + poll = true; + } + + switch (pl->cfg_link_an_mode) { + case MLO_AN_FIXED: + poll |= pl->config->poll_fixed_state; + break; + case MLO_AN_INBAND: + poll |= pl->config->pcs_poll; + break; } - if ((pl->cfg_link_an_mode == MLO_AN_FIXED && pl->get_fixed_state) || - pl->config->pcs_poll) + if (poll) mod_timer(&pl->link_poll, jiffies + HZ); if (pl->phydev) phy_start(pl->phydev); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 3f8d37ec5503..cc5b452a184e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -67,6 +67,9 @@ struct phylink_config { struct device *dev; enum phylink_op_type type; bool pcs_poll; + bool poll_fixed_state; + void (*get_fixed_state)(struct phylink_config *config, + struct phylink_link_state *state); }; /** @@ -366,9 +369,6 @@ void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); int phylink_of_phy_connect(struct phylink *, struct device_node *, u32 flags); void phylink_disconnect_phy(struct phylink *); -int phylink_fixed_state_cb(struct phylink *, - void (*cb)(struct net_device *dev, - struct phylink_link_state *)); void phylink_mac_change(struct phylink *, bool up); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f2c241cf3a80..1035230771ae 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1590,10 +1590,10 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); -static void dsa_slave_phylink_fixed_state(struct net_device *dev, +static void dsa_slave_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would @@ -1633,6 +1633,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) dp->pl_config.dev = &slave_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; + /* The get_fixed_state callback takes precedence over polling the + * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set + * this if the switch provides such a callback. 
+ */ + if (ds->ops->phylink_fixed_state) { + dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state; + dp->pl_config.poll_fixed_state = true; + } + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1641,13 +1650,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return PTR_ERR(dp->pl); } - /* Register only if the switch provides such a callback, since this - * callback takes precedence over polling the link GPIO in PHYLINK - * (see phylink_get_fixed_state). - */ - if (ds->ops->phylink_fixed_state) - phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state); - if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); -- cgit v1.2.3 From 071c8ed6e88d2ac0a5f26948fb9c288fd4dd6e40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Apr 2020 12:31:50 +0200 Subject: tcp: mptcp: use mptcp receive buffer space to select rcv window In MPTCP, the receive window is shared across all subflows, because it refers to the mptcp-level sequence space. MPTCP receivers already place incoming packets on the mptcp socket receive queue and will charge it to the mptcp socket rcvbuf until userspace consumes the data. Update __tcp_select_window to use the occupancy of the parent/mptcp socket instead of the subflow socket in case the tcp socket is part of a logical mptcp connection. This commit doesn't change the choice of initial window for passive or active connections. While it would be possible to change those as well, this adds complexity (especially when handling MP_JOIN requests). Furthermore, the MPTCP RFC specifically says that an MPTCP sender 'MUST NOT use the RCV.WND field of a TCP segment at the connection level if it does not also carry a DSS option with a Data ACK field.' SYN/SYNACK packets do not carry a DSS option with a Data ACK field. Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/mptcp.h | 3 +++ net/ipv4/tcp_output.c | 8 ++++++-- net/mptcp/subflow.c | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0e7c5471010b..5288fba56e55 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,6 +68,8 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } +void mptcp_space(const struct sock *ssk, int *space, int *full_space); + void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, @@ -197,6 +199,7 @@ static inline bool mptcp_sk_is_subflow(const struct sock *sk) return false; } +static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f45cde168c4..ba4482130f08 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2772,8 +2772,12 @@ u32 __tcp_select_window(struct sock *sk) int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); - int full_space = min_t(int, tp->window_clamp, allowed_space); - int window; + int full_space, window; + + if (sk_is_mptcp(sk)) + mptcp_space(sk, &free_space, &allowed_space); + + full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fabd06f2ff45..87c094702d63 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -821,6 +821,24 @@ bool mptcp_subflow_data_available(struct sock *sk) return subflow->data_avail; } +/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, + * not the ssk one. + * + * In mptcp, rwin is about the mptcp-level connection data. + * + * Data that is still on the ssk rx queue can thus be ignored, + * as far as mptcp peer is concerned that data is still inflight. + * DSS ACK is updated when skb is moved to the mptcp rx queue. + */ +void mptcp_space(const struct sock *ssk, int *space, int *full_space) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + const struct sock *sk = subflow->conn; + + *space = tcp_space(sk); + *full_space = tcp_full_space(sk); +} + static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); -- cgit v1.2.3 From f30e472071c827e2c2f191c8a011b7e371e17af5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 24 Apr 2020 12:43:09 +0000 Subject: hsr: remove unnecessary code in hsr_dev_change_mtu() In hsr_dev_change_mtu(), the 'dev' and 'master->dev' pointers are the same. So, the 'master' variable and some code are unnecessary. Signed-off-by: Taehee Yoo Signed-off-by: David S.
Miller --- net/hsr/hsr_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fc7027314ad8..cd99f548e440 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -125,13 +125,11 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; - struct hsr_port *master; hsr = netdev_priv(dev); - master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (new_mtu > hsr_get_max_mtu(hsr)) { - netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", + netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } -- cgit v1.2.3 From df346f1aac6cef551b69d788b022a942270dc17b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 24 Apr 2020 21:13:34 +0800 Subject: dccp: remove unused inline function dccp_set_seqno There are no callers in-tree since commit 792b48780e8b ("dccp: Implement both feature-local and feature-remote Sequence Window feature") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/dccp/dccp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9c3b27c257bb..7dce4f6c7025 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -108,11 +108,6 @@ extern int sysctl_dccp_sync_ratelimit; #define ADD48(a, b) (((a) + (b)) & UINT48_MAX) #define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) -static inline void dccp_set_seqno(u64 *seqno, u64 value) -{ - *seqno = value & UINT48_MAX; -} - static inline void dccp_inc_seqno(u64 *seqno) { *seqno = ADD48(*seqno, 1); -- cgit v1.2.3 From 4b36a0dff794a00989a50581aed2f94c88b57107 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 25 Apr 2020 11:39:47 +0800 Subject: net: openvswitch: suitable access to the dp_meters To fix the following sparse warning: | net/openvswitch/meter.c:109:38: sparse: sparse: incorrect type | in assignment (different address spaces) ... | net/openvswitch/meter.c:720:45: sparse: sparse: incorrect type | in argument 1 (different address spaces) ... Reported-by: kbuild test robot Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 915f31123f23..612ad5586ce9 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -107,8 +107,8 @@ dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) return -ENOMEM; for (i = 0; i < n_meters; i++) - new_ti->dp_meters[i] = - rcu_dereference_ovsl(ti->dp_meters[i]); + if (rcu_dereference_ovsl(ti->dp_meters[i])) + new_ti->dp_meters[i] = ti->dp_meters[i]; rcu_assign_pointer(tbl->ti, new_ti); call_rcu(&ti->rcu, dp_meter_instance_free_rcu); @@ -752,7 +752,7 @@ void ovs_meters_exit(struct datapath *dp) int i; for (i = 0; i < ti->n_meters; i++) - ovs_meter_free(ti->dp_meters[i]); + ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); } -- cgit v1.2.3 From 659d4587fe7233bfdff303744b20d6f41ad04362 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 25 Apr 2020 11:39:48 +0800 Subject: net: openvswitch: use div_u64() for 64-by-32 divisions Compiling the kernel for the arm 32 platform produces the build warning below. To fix it, use div_u64() for the division.
| net/openvswitch/meter.c:396: undefined reference to `__udivdi3' [add more commit msg, change reported tag, and use div_u64 instead of do_div by Tonghao] Fixes: e57358873bb5d6ca ("net: openvswitch: use u64 for meter bucket") Reported-by: kbuild test robot Signed-off-by: Tonghao Zhang Tested-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 612ad5586ce9..3d3d8e094546 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -393,7 +393,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * Start with a full bucket. */ band->bucket = (band->burst_size + band->rate) * 1000ULL; - band_max_delta_t = band->bucket / band->rate; + band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(); every other type of program (except bpf-lirc and, understandably, tracing) uses it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging.
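For illustration, a minimal cgroup/sysctl program could now combine two of these helpers; this is only a sketch, not part of the change, and the map name and always-allow policy are assumptions:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: a BPF_PROG_TYPE_CGROUP_SYSCTL program
 * using bpf_ktime_get_ns() and bpf_perf_event_output(), both newly
 * reachable for this program type. Map name and policy are made up.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(__u32));
} events SEC(".maps");

SEC("cgroup/sysctl")
int sysctl_audit(struct bpf_sysctl *ctx)
{
        __u64 ts = bpf_ktime_get_ns();

        /* Emit a timestamped event for every sysctl access. */
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                              &ts, sizeof(ts));

        return 1;       /* allow the access */
}

char _license[] SEC("license") = "GPL";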
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 20 +++--------------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/event_output.c | 24 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd2b2322412d..25da6ff2a880 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_event_output_data_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..a943df3ad8b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4214,7 +4214,7 @@ BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_event_output_data_proto = { +const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + 
.retval = 1, +}, -- cgit v1.2.3 From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n The linux-next build bot reported a compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to the existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- include/linux/bpf.h | 8 ++++++ include/linux/filter.h | 2 -- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 78 +------------------------------------------------- 5 files changed, 87 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 25da6ff2a880..5147e11e53ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1215,6 +1215,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct bpf_prog *bpf_prog_by_id(u32 id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1365,6 +1366,12 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } + +static inline const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1531,6 +1538,7 @@ const struct bpf_func_proto *bpf_tracing_func_proto( /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, diff --git a/include/linux/filter.h b/include/linux/filter.h index 9b5aa5c483cc..af37318bb1c5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -863,8 +863,6 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall.
*/ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + return NULL; + } +} diff --git a/net/core/filter.c b/net/core/filter.c index a943df3ad8b0..a605626142b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -4205,26 +4194,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} - -const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = 
ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5983,52 +5952,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -- cgit v1.2.3 From 6f3f65d80dac8f2bafce2213005821fccdce194c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 20 Apr 2020 11:34:08 -0700 Subject: net: bpf: Allow TC programs to call BPF_FUNC_skb_change_head This allows TC eBPF programs to modify and forward (redirect) packets from interfaces without ethernet headers (for example cellular) to interfaces with them (for example ethernet/wifi). The lack of this appears to simply be an oversight. Tested: in active use in Android R on 4.14+ devices for ipv6 cellular to wifi tethering offload.
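As a hedged illustration of the newly allowed call (not from this patch), a TC program on a header-less device could prepend an ethernet header before redirecting; the target ifindex and both MAC addresses below are placeholder assumptions:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: prepend an ethernet header with
 * bpf_skb_change_head() before redirecting from a header-less
 * (e.g. cellular) interface to an ethernet device.
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

#define ETH_IFINDEX 2   /* hypothetical ethernet device */

SEC("classifier")
int cell_to_eth(struct __sk_buff *skb)
{
        struct ethhdr eth = {
                .h_dest   = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
                .h_source = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 },
                .h_proto  = bpf_htons(ETH_P_IPV6),
        };

        /* Make room for the header; now callable from TC programs. */
        if (bpf_skb_change_head(skb, sizeof(eth), 0))
                return TC_ACT_SHOT;
        /* Fill in the header we just reserved. */
        if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0))
                return TC_ACT_SHOT;

        return bpf_redirect(ETH_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";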
Signed-off-by: Lorenzo Colitti Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a605626142b6..da3b7a72c37c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6137,6 +6137,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3 From 0a05861f80fe7d4dcfdabcc98d9854947573e072 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 22 Apr 2020 01:29:27 +0200 Subject: xsk: Fix typo in xsk_umem_consume_tx and xsk_generic_xmit comments s/backpreassure/backpressure/ Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20200421232927.21082-1-tklauser@distanz.ch --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c350108aa38d..f6e6609f70a3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -322,7 +322,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. @@ -406,7 +406,7 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. -- cgit v1.2.3 From 74f99482eae03195ced512b440b31d62bdb6e943 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Tue, 21 Apr 2020 10:04:16 -0500 Subject: netfilter: nf_conntrack: add IPS_HW_OFFLOAD status bit This bit indicates that the conntrack entry is offloaded to a hardware flow table. The nf_conntrack entry will be tagged with [HW_OFFLOAD] if it is offloaded to hardware. cat /proc/net/nf_conntrack ipv4 2 tcp 6 \ src=1.1.1.17 dst=1.1.1.16 sport=56394 dport=5001 \ src=1.1.1.16 dst=1.1.1.17 sport=5001 dport=56394 [HW_OFFLOAD] \ mark=0 zone=0 use=3 Note that HW_OFFLOAD/OFFLOAD/ASSURED are mutually exclusive. Changelog: * V1->V2: - Remove check of lastused from stats. It was meant for cases such as removing the driver module while traffic is still running. Better to handle such cases from the garbage collector.
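For illustration only, a reader of ct->status could classify an entry with the same precedence ct_seq_show() uses after this change; the helper name is an assumption, only the bit tests mirror the patch:

#include <net/netfilter/nf_conntrack.h>

/* Illustrative sketch: mirror the ct_seq_show() precedence of
 * HW_OFFLOAD over OFFLOAD over ASSURED. The function name is
 * made up for the example.
 */
static const char *ct_offload_state(const struct nf_conn *ct)
{
        if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status))
                return "hw-offloaded";
        if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
                return "sw-offloaded";
        if (test_bit(IPS_ASSURED_BIT, &ct->status))
                return "assured";
        return "tracked";
}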
Signed-off-by: Bodong Wang Reviewed-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 8 ++++++-- net/netfilter/nf_conntrack_standalone.c | 4 +++- net/netfilter/nf_flow_table_offload.c | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index b6f0bb1dc799..4b3395082d15 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -114,15 +114,19 @@ enum ip_conntrack_status { IPS_OFFLOAD_BIT = 14, IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT), + /* Conntrack has been offloaded to hardware. */ + IPS_HW_OFFLOAD_BIT = 15, + IPS_HW_OFFLOAD = (1 << IPS_HW_OFFLOAD_BIT), + /* Be careful here, modifying these bits can make things messy, * so don't let users modify them directly. */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | - IPS_OFFLOAD), + IPS_OFFLOAD | IPS_HW_OFFLOAD), - __IPS_MAX_BIT = 15, + __IPS_MAX_BIT = 16, }; /* Connection tracking event types */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..5a3e6c43ee68 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -348,7 +348,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[HW_OFFLOAD] "); + else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e3b099c14eff..a2abb0feab7f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -754,12 +754,15 @@ static void flow_offload_work_add(struct flow_offload_work *offload) err = flow_offload_rule_add(offload, flow_rule); if (err < 0) set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + else + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); nf_flow_offload_destroy(flow_rule); } static void flow_offload_work_del(struct flow_offload_work *offload) { + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); -- cgit v1.2.3 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handlers just pass the data through to one of the common handlers, a lot of the changes are mechanical.
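For illustration, a custom handler under the new calling convention looks as follows; the handler name and the pr_debug() line are assumptions, while the signature and the proc_dointvec() call match the converted code below:

/* Illustrative sketch: a ->proc_handler after this change.
 * "buffer" is a kernel pointer now; on writes the common code
 * has already copied the data in and NUL-terminated it, so no
 * copy_from_user()/copy_to_user() (or set_fs() games) is needed.
 */
static int example_dointvec(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        if (write)
                pr_debug("sysctl write: %.*s\n", (int)*lenp,
                         (char *)buffer);

        /* The common helpers keep the same signature. */
        return proc_dointvec(table, write, buffer, lenp, ppos);
}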
Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/fpsimd.c | 3 +- arch/mips/lasat/sysctl.c | 13 +- arch/s390/appldata/appldata_base.c | 11 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 12 +- arch/x86/kernel/itmt.c | 3 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/random.c | 2 +- drivers/macintosh/mac_hid.c | 3 +- drivers/parport/procfs.c | 39 +++--- fs/dcache.c | 2 +- fs/drop_caches.c | 2 +- fs/file_table.c | 4 +- fs/fscache/main.c | 3 +- fs/inode.c | 2 +- fs/proc/proc_sysctl.c | 47 ++++--- fs/quota/dquot.c | 2 +- fs/xfs/xfs_sysctl.c | 4 +- include/linux/bpf-cgroup.h | 9 +- include/linux/compaction.h | 2 +- include/linux/fs.h | 6 +- include/linux/ftrace.h | 3 +- include/linux/hugetlb.h | 15 +- include/linux/kprobes.h | 2 +- include/linux/latencytop.h | 4 +- include/linux/mm.h | 12 +- include/linux/mmzone.h | 23 ++- include/linux/nmi.h | 15 +- include/linux/perf_event.h | 13 +- include/linux/printk.h | 2 +- include/linux/sched/sysctl.h | 44 ++---- include/linux/security.h | 2 +- include/linux/sysctl.h | 53 +++---- include/linux/timer.h | 3 +- include/linux/vmstat.h | 8 +- include/linux/writeback.h | 28 ++-- ipc/ipc_sysctl.c | 10 +- ipc/mq_sysctl.c | 4 +- kernel/bpf/cgroup.c | 35 ++--- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++-------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +- mm/compaction.c | 2 +- mm/hugetlb.c | 9 +- mm/page-writeback.c | 16 +-- mm/page_alloc.c | 30 ++-- mm/util.c | 10 +- mm/vmstat.c | 4 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 28 ++-- net/core/sysctl_net_core.c | 27 ++-- net/decnet/dn_dev.c | 7 +- net/decnet/sysctl_net_decnet.c | 27 ++-- net/ipv4/devinet.c | 9 +- net/ipv4/route.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 38 ++--- net/ipv6/addrconf.c | 33 ++--- net/ipv6/ndisc.c | 3 +- net/ipv6/route.c | 5 +- net/ipv6/sysctl_net_ipv6.c | 3 +- net/mpls/af_mpls.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 6 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/phonet/sysctl.c | 3 +- net/rds/tcp.c | 6 +- net/sctp/sysctl.c | 32 ++--- net/sunrpc/sysctl.c | 29 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 7 +- security/apparmor/lsm.c | 2 +- security/min_addr.c | 2 +- security/yama/yama_lsm.c | 2 +- 88 files changed, 458 insertions(+), 653 deletions(-) (limited to 'net') diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef 
CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index e666fe26c50d..2119541a5b8b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write, len = 0; p = buffer; while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; + c = *p; + p++; if (c == 0 || c == '\n') break; len++; } if (len >= sizeof(ipbuf)-1) len = sizeof(ipbuf) - 1; - if (copy_from_user(ipbuf, buffer, len)) - return -EFAULT; + memcpy(ipbuf, buffer, len); ipbuf[len] = 0; *ppos += *lenp; /* Now see if we can convert it to a valid IP */ @@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, ipbuf, len)) - return -EFAULT; + memcpy(buffer, ipbuf, len); if (len < *lenp) { - if (put_user('\n', ((char *) buffer) + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; len++; } *lenp = len; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index aa738cad1338..d74a4c7d5df6 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev; */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; static struct ctl_table appldata_table[] = { @@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd) */ static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; int rc; @@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, */ static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; int rc; @@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, */ static int appldata_generic_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; struct list_head *lh; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..636446003a06 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -867,7 +867,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5f70cefc13e4..332b542548cd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -594,7 
+594,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index ae989b740376..36bce727897b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..e3bbe108eb54 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void) } static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..1e0db78b83ba 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2057,7 +2057,7 @@ static char sysctl_bootid[16]; * sysctl system call, as 16 bytes of binary data. 
*/ static int proc_do_uuid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; unsigned char buf[64], tmp_uuid[16], *uuid; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7af0c536d568..28b8581b44dd 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void) } static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int old_val = *valp; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 48804049d697..ee7b5daabfd4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -34,7 +34,7 @@ #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; @@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; @@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write, *ppos += len; - return copy_to_user (result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_irq(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_dma(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? 
-EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_modes(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..8dd4d8d7bd0b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } int drop_caches_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..3b612535391f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files); */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda3..c1e6cc9091aa 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2; static struct ctl_table_header *fscache_sysctl_header; static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; unsigned int *datap = table->data; diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..cc6e701b7e5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void) */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d459b087..df2143e05c57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -539,13 +539,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = 
PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } + } else { + error = -ENOMEM; + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); if (error) - goto out; + goto out_free_buf; /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); - } else { - error = table->proc_handler(table, write, buf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; } - if (!error) - error = count; + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b6a4f692d345..7b4bac91146b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c11b413d5b1a..0b41fd5fc96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,8 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, @@ -302,12 +301,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, pos, nbuf, \ + buf, count, pos, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -429,7 +428,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define 
BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..a0eabfbeb0e1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..9b028d260649 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ddfc377de0d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..92c21c5ccc58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); - -#ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct 
mm_struct *, struct vm_area_struct *, diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..594265bfd390 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn); #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *length, loff_t *ppos); #endif extern void wait_for_kprobe_optimizer(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 9022f0c2e2e4..abe3d95f795b 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -extern int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c4e7e76dedd..a7b1ef8ed970 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2af594ef0f7..93cf20f41e26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -910,22 +910,21 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -extern int 
numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); +int numa_zonelist_order_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9003e29cde46..750c7f395ca9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif struct ctl_table; -extern int proc_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_nmi_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_soft_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_thresh(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_cpumask(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..347ea379622a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. 
*/ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..fcde0772ec98 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,7 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos); extern void wake_up_klogd(void); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..7b4d3a49b6c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; @@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); #endif /* @@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled; extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#ifdef CONFIG_UCLAMP_TASK -extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif - -extern int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -extern int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..6aa229b252ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -211,7 +211,7 @@ struct request_sock; #ifdef CONFIG_MMU extern int 
mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif /* security_inode_init_security callback function to write xattrs */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 36143ca40b56..f2401e45a3c2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -44,35 +44,26 @@ struct ctl_dir; extern const int sysctl_vals[]; -typedef int proc_handler (struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -extern int proc_dostring(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int proc_dointvec_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); -extern int proc_do_large_bitmap(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); + +int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, + size_t *, loff_t *); +int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table @@ -246,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 0dc19a8c39c9..07910ae5ddd9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -201,8 +201,7 @@ struct ctl_table; extern 
unsigned int sysctl_timer_migration; int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif unsigned long __round_jiffies(unsigned long j, int cpu); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..cb507151710f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -16,8 +16,8 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, - int write, void __user *buffer, size_t *length, loff_t *ppos); +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, + loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..f8a7e1a850fb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index affd66537e87..d1b8644bfb88 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write, } static int 
proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); @@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 7c00f28923a8..72a92a08c848 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table) } static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); @@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write, } static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + 
loff_t pos = 0;
 	int ret;
 
 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
-	if (ctx.cur_val) {
-		mm_segment_t old_fs;
-		loff_t pos = 0;
-
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
-					&ctx.cur_len, &pos)) {
-			/* Let BPF program decide how to proceed. */
-			ctx.cur_len = 0;
-		}
-		set_fs(old_fs);
-	} else {
+	if (!ctx.cur_val ||
+	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
 		/* Let BPF program decide how to proceed. */
 		ctx.cur_len = 0;
 	}
 
-	if (write && buf && *pcount) {
+	if (write && *buf && *pcount) {
 		/* BPF program should be able to override new value with a
 		 * buffer bigger than provided by user.
 		 */
 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
-		if (!ctx.new_val ||
-		    copy_from_user(ctx.new_val, buf, ctx.new_len))
+		if (ctx.new_val) {
+			memcpy(ctx.new_val, *buf, ctx.new_len);
+		} else {
 			/* Let BPF program decide how to proceed. */
 			ctx.new_len = 0;
+		}
 	}
 
 	rcu_read_lock();
@@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 	kfree(ctx.cur_val);
 
 	if (ret == 1 && ctx.new_updated) {
-		*new_buf = ctx.new_val;
+		kfree(*buf);
+		*buf = ctx.new_val;
 		*pcount = ctx.new_len;
 	} else {
 		kfree(ctx.new_val);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c2b41a263166..bdb1533ada81 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -236,7 +236,7 @@ exit_put:
 * sysctl_perf_event_max_contexts_per_stack.
 */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
-				 void __user *buffer, size_t *lenp, loff_t *ppos)
+				 void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int *value = table->data;
 	int new_value = *value, ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc9b98a9af9a..f86d46f2c4d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void)
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
+		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
-				void __user *buffer, size_t *lenp,
-				loff_t *ppos)
+				void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00..ffbe03a45c16 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void)
 static DEFINE_MUTEX(kprobe_sysctl_mutex);
 int sysctl_kprobes_optimization;
 int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
-				      void __user *buffer, size_t *length,
+				      void *buffer, size_t *length,
 				      loff_t *ppos)
 {
 	int ret;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 8d1c15832e55..166d7bf49666 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void)
 	return 0;
 }
 
-int sysctl_latencytop(struct ctl_table *table, int write,
-		      void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+		      size_t *lenp, loff_t *ppos)
 {
int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- 
a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. 
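 *
 * A minimal usage sketch under the new kernel-buffer convention (the
 * caller and buffer size here are illustrative, not from this patch):
 *
 *	char tmp[32];
 *	void *p = tmp;
 *	size_t left = sizeof(tmp);
 *
 *	proc_put_long(&p, &left, 42UL, false);
 *
 * On return tmp begins with "42", @buf has advanced past the digits and
 * @size has shrunk by the same amount; there is no error to check.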
*/
-static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
-			 bool neg)
+static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg)
 {
 	int len;
 	char tmp[TMPBUFLEN], *p = tmp;
@@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
 	len = strlen(tmp);
 	if (len > *size)
 		len = *size;
-	if (copy_to_user(*buf, tmp, len))
-		return -EFAULT;
+	memcpy(*buf, tmp, len);
 	*size -= len;
 	*buf += len;
-	return 0;
 }
 #undef TMPBUFLEN
 
-static int proc_put_char(void __user **buf, size_t *size, char c)
+static void proc_put_char(void **buf, size_t *size, char c)
 {
 	if (*size) {
-		char __user **buffer = (char __user **)buf;
-		if (put_user(c, *buffer))
-			return -EFAULT;
-		(*size)--, (*buffer)++;
+		char **buffer = (char **)buf;
+		**buffer = c;
+
+		(*size)--;
+		(*buffer)++;
 		*buf = *buffer;
 	}
-	return 0;
 }
 
 static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
@@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-		  int write, void __user *buffer,
+		  int write, void *buffer,
 		  size_t *lenp, loff_t *ppos,
 		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
@@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 {
 	int *i, vleft, first = 1, err = 0;
 	size_t left;
-	char *kbuf = NULL, *p;
+	char *p;
 
 	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
 		*lenp = 0;
@@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 		if (left > PAGE_SIZE - 1)
 			left = PAGE_SIZE - 1;
-		p = kbuf = memdup_user_nul(buffer, left);
-		if (IS_ERR(kbuf))
-			return PTR_ERR(kbuf);
+		p = buffer;
 	}
 
 	for (; left && vleft--; i++, first=0) {
@@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 				break;
 			}
 			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
-			if (err)
-				break;
-			err = proc_put_long(&buffer, &left, lval, neg);
-			if (err)
-				break;
+				proc_put_char(&buffer, &left, '\t');
+			proc_put_long(&buffer, &left, lval, neg);
 		}
 	}
 
 	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
+		proc_put_char(&buffer, &left, '\n');
 	if (write && !err && left)
 		left -= proc_skip_spaces(&p);
-	if (write) {
-		kfree(kbuf);
-		if (first)
-			return err ? : -EINVAL;
-	}
+	if (write && first)
+		return err ?
: -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. 
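 *
 * A minimal sketch of calling it with a kernel buffer, as all callers
 * do after this patch (the buffer size and the pr_debug are illustrative,
 * not from this patch):
 *
 *	char buf[32];
 *	size_t len = sizeof(buf);
 *	loff_t pos = 0;
 *
 *	if (!proc_douintvec(table, 0, buf, &len, &pos))
 *		pr_debug("value is %.*s", (int)len, buf);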
*/ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long 
convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. 
*/ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t 
*ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? 
*/ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. 
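 *
 * (Sketch, not part of the patch: a write such as
 *
 *	echo 0-3 > /proc/sys/kernel/watchdog_cpumask
 *
 * now arrives here as kernel memory, so the bitmap parser this handler
 * relies on can walk the "0-3" string directly instead of taking its own
 * memdup_user_nul() copy.)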
*/ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..d8cfb7b99a83 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2463,7 +2463,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..2277c5728b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3375,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d3ee4c4dafac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int 
write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c1550cd43e..0c43e9ae5004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7979,7 +7969,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8009,7 +7999,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8036,7 +8026,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8060,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); @@ -8082,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. 
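 *
 * (Sketch, not part of the patch: after this series a write such as
 *
 *	echo 8 > /proc/sys/vm/percpu_pagelist_fraction
 *
 * reaches this handler as a NUL-terminated kernel copy of "8\n" made by
 * memdup_user_nul() in proc_sys_call_handler(), so a helper such as
 * proc_dointvec_minmax() can parse it in place without copy_from_user().)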
*/ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..8defc8ec141f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..c03a8c914922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..3f2263e79e4b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } 
EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..0ddb13a6282b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +179,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +190,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, 
loff_t *ppos) { int ret; @@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? 
*lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..a118978d222c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2361,8 +2361,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2414,8 +2413,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2458,8 +2456,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..041f4dcac440 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..868e317cc324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. 
*/ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static 
int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..9d0e89bccb90 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6108,9 +6108,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6133,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6204,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6229,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6271,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6333,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6400,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6500,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..58f1255295d3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..acdb31e38412 100644 
--- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6088,9 +6088,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..a42e4ed5ab0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2594,7 +2593,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..31b027b12ff3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -517,7 +517,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t 
*ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..46782fac4c16 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct 
ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. 
*/ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..27e371b44dad 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void) #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!policy_admin_capable(NULL)) return -EPERM; diff --git a/security/min_addr.c b/security/min_addr.c index 94d2b0cf0e7b..88c9a6a21f47 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -30,7 +30,7 @@ static void update_mmap_min_addr(void) * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 94dc346370b1..536c99646f6a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; -- cgit v1.2.3
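Every hunk in the patch above makes the same mechanical change: ->proc_handler callbacks now receive a kernel-space buffer instead of a user-space one, so access_ok()/copy_from_user()/copy_to_user() can be replaced with plain pointer accesses. As a hedged illustration (the handler and the value it backs are invented for this sketch, not taken from any patch above), a handler written against the new convention looks roughly like this:

/* Sketch only: a handler under the new kernel-pointer convention.
 * "buffer" is a kernel pointer, so the generic helpers can be called
 * directly and any extra parsing can use plain memory accesses.
 */
static int example_dointvec(struct ctl_table *table, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (write && !ret)
		pr_debug("example sysctl set to %d\n", *(int *)table->data);

	return ret;
}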
From e62905ae34eaf5fe2cfb254be5e0c097b3b1f798 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Apr 2020 11:39:20 +0200 Subject: xfrm interface: don't take extra reference to netdev I don't see any reason to do this. Maybe needed before commit 56c5ee1a5823 ("xfrm interface: fix memory leak on creation"). Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 3361e3ac5714..eb9928c0a87c 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -145,7 +145,6 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - dev_hold(dev); xfrmi_link(xfrmn, xi); return 0; @@ -175,7 +174,6 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); xfrmi_unlink(xfrmn, xi); - dev_put(dev); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) -- cgit v1.2.3 From 7d3118016787b5c05da94b3bcdb96c9d6ff82c44 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 25 Apr 2020 12:28:14 +0100 Subject: net: rtnetlink: remove redundant assignment to variable err The variable err is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6f4f4a9e8ba..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3997,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) -- cgit v1.2.3 From 2cc974f83fb505751b7fbcf8dee27bdcc7054a7e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:21:59 +0200 Subject: bridge: mrp: Update Kconfig Add the BRIDGE_MRP option to allow building the bridge with or without MRP support. The default value is N. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index e4fb050e2078..51a6414145d2 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -61,3 +61,15 @@ config BRIDGE_VLAN_FILTERING Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config BRIDGE_MRP + bool "MRP protocol" + depends on BRIDGE + default n + help + If you say Y here, then the Ethernet bridge will be able to run the + MRP protocol to detect loops. + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. -- cgit v1.2.3 From 4b8d7d4c599182393421c190bae3604b4db9629a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:00 +0200 Subject: bridge: mrp: Extend bridge interface To integrate MRP into the bridge, first the bridge needs to be aware of the ports that are part of an MRP ring and of which rings are on the bridge. Therefore extend the bridge interface with the following (a short usage sketch follows the list): - add a new flag (BR_MRP_AWARE) to the net bridge ports; this bit will be set when the port is added to an MRP instance. In this way the bridge knows whether a frame was received on an MRP ring port. - add a new flag (BR_MRP_LOST_CONT) to the net bridge ports; this bit will be set when the port loses the continuity of MRP Test frames. - add a list of MRP instances
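As a usage sketch (the ex_* helpers are invented for illustration; the flag logic mirrors how the later patches in this series use the bits), the two flags are meant to be exercised like this:

/* Illustrative only: how the two new per-port flags are meant to be
 * used. BR_MRP_AWARE marks a port that belongs to an MRP instance;
 * BR_MRP_LOST_CONT latches loss of MRP_Test frame continuity.
 */
static bool ex_port_is_mrp_ring_port(const struct net_bridge_port *p)
{
	return !!(p->flags & BR_MRP_AWARE);
}

static void ex_port_update_continuity(struct net_bridge_port *p, bool lost)
{
	if (lost)
		p->flags |= BR_MRP_LOST_CONT;
	else
		p->flags &= ~BR_MRP_LOST_CONT;
}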
Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 2 ++ net/bridge/br_private.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 9e57c4411734..b3a8d3054af0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -47,6 +47,8 @@ struct br_ip_list { #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) #define BR_ISOLATED BIT(16) +#define BR_MRP_AWARE BIT(17) +#define BR_MRP_LOST_CONT BIT(18) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f97703a52ff..835a70f8d3ea 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -428,6 +428,10 @@ struct net_bridge { int offload_fwd_mark; #endif struct hlist_head fdb_list; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + struct list_head __rcu mrp_list; +#endif }; struct br_input_skb_cb { -- cgit v1.2.3 From 3e54442c93845316762b1b3c75e654463fd1b715 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:01 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_RING_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_RING_OPEN, which allows notifying userspace when the port loses the continuity of MRP frames. This attribute is set by the kernel whenever the SW or HW detects that the ring is being opened or closed.
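As a hedged illustration of the userspace side (not part of this patch; the helper is invented and assumes the caller has already parsed the port's bridge attributes out of an RTM_NEWLINK notification with the usual RTA_* macros):

/* Sketch only: reacting to the new attribute in a rtnetlink monitor. */
#include <stdio.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>

static void ex_check_ring_open(struct rtattr *tb[])
{
	if (tb[IFLA_BRPORT_MRP_RING_OPEN] &&
	    *(unsigned char *)RTA_DATA(tb[IFLA_BRPORT_MRP_RING_OPEN]))
		printf("MRP ring open: port lost MRP_Test continuity\n");
}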
Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 127c704eeba9..a009365ad67b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..4084f1ef8641 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index ca6665ea758a..cafedbbfefbe 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 2f1a11ae11d222b3a3b41d09a85cb2bf8f83db49 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:02 +0200 Subject: bridge: mrp: Add MRP interface. Define the MRP interface. This interface is used by the netlink code to update the MRP instances and by the MRP code to call into switchdev to offload the configuration to HW. It defines an MRP instance, 'struct br_mrp', which is kept on a list of MRP instances that is part of 'struct net_bridge'. Each instance has 2 ring ports, a bridge and an ID. In case the HW can't generate MRP Test frames then the SW will generate those. The following functions make up the interface (a usage sketch follows the list): br_mrp_add - adds a new MRP instance. br_mrp_del - deletes an existing MRP instance. Each instance has an ID (ring_id). br_mrp_set_port_state - changes the port state. The port can be in forwarding state, which means that frames can pass through, or in blocked state, which means that no frames except MRP frames can pass through. This will eventually call the switchdev API to notify the HW. This information is also used by the SW bridge to know how to forward frames in case the HW doesn't have this capability. br_mrp_set_port_role - a port role can be primary or secondary. This information is required to be pushed to the HW in case the HW can generate MRP_Test frames, because the MRP_Test frames contain a field with this information. Otherwise the HW will not be able to generate the frames correctly. br_mrp_set_ring_state - a ring can be in state open or closed. State open means that the mrp port stopped receiving MRP_Test frames, while closed means that the mrp port received MRP_Test frames. As with the port role, this information is pushed to the HW because the MRP_Test frames contain this information. br_mrp_set_ring_role - a ring can have one of the following roles: MRM or MRC. For the role MRM it is expected that the HW can terminate the MRP frames, notify the SW that it stopped receiving MRP_Test frames and trap all the other MRP frames. For MRC mode it is expected that the HW can forward the MRP frames only between the MRP ports and copy MRP_Topology frames to the CPU. In case the HW doesn't support a role it needs to return an error code different from -EOPNOTSUPP. br_mrp_start_test - this starts/stops the generation of MRP_Test frames. To stop the generation of frames the interval needs to have a value of 0. The userspace needs to know whether the HW supports this, so that duplicate frames (generated by both HW and SW) are avoided: if the HW supports this then the SW will not generate frames anymore and will expect the HW to notify it, via br_mrp_port_open, when it stops receiving MRP frames. br_mrp_port_open - this function is used by drivers to notify the userspace via a netlink callback that one of the ports stopped receiving MRP_Test frames. This function is called only when the node has the role MRM. It is not supposed to be called from userspace. br_mrp_switchdev_add - this corresponds to the function br_mrp_add, and will notify the HW that an MRP instance is added. The function gets the MRP instance as parameter. br_mrp_switchdev_del - this corresponds to the function br_mrp_del, and will notify the HW that an MRP instance is removed. The function gets the ID of the removed MRP instance as parameter. br_mrp_port_switchdev_set_state - this corresponds to the function br_mrp_set_port_state. It notifies the HW whether it should block non-MRP frames. br_mrp_port_switchdev_set_role - this corresponds to the function br_mrp_set_port_role. It sets the port role, primary or secondary. br_mrp_switchdev_set_ring_role - this corresponds to the function br_mrp_set_ring_role and sets one of the ring roles, MRM or MRC. br_mrp_switchdev_set_ring_state - this corresponds to the function br_mrp_set_ring_state and sets the ring to be open or closed. br_mrp_switchdev_send_ring_test - this corresponds to the function br_mrp_start_test. This will notify the HW to start or stop generating MRP_Test frames. Value 0 for the interval parameter means to stop generating the frames.
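To make the call flow concrete, here is a minimal sketch (not part of the patch; the ring ID and timing values are invented) of how a caller such as the netlink code added later in this series would configure a manager (MRM) node:

/* Illustrative sketch only: driving the interface for an MRM node. */
static int ex_setup_mrm(struct net_bridge *br, u32 p_ifindex, u32 s_ifindex)
{
	struct br_mrp_instance inst = {
		.ring_id = 1,
		.p_ifindex = p_ifindex,
		.s_ifindex = s_ifindex,
	};
	struct br_mrp_ring_role role = {
		.ring_id = 1,
		.ring_role = BR_MRP_RING_ROLE_MRM,
	};
	struct br_mrp_start_test test = {
		.ring_id = 1,
		.interval = 500000,	/* usecs between MRP_Test frames */
		.max_miss = 3,		/* misses before the ring is open */
		.period = 10000000,	/* how long to keep testing, usecs */
	};
	int err;

	err = br_mrp_add(br, &inst);
	if (err)
		return err;

	err = br_mrp_set_ring_role(br, &role);
	if (err) {
		br_mrp_del(br, &inst);
		return err;
	}

	/* Offloaded to HW when possible, otherwise generated in SW */
	return br_mrp_start_test(br, &test);
}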
Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_private_mrp.h | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 net/bridge/br_private_mrp.h (limited to 'net') diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h new file mode 100644 index 000000000000..2921a4b59f8e --- /dev/null +++ b/net/bridge/br_private_mrp.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_MRP_H_ +#define _BR_PRIVATE_MRP_H_ + +#include "br_private.h" +#include <uapi/linux/mrp_bridge.h> + +struct br_mrp { + /* list of mrp instances */ + struct list_head __rcu list; + + struct net_bridge_port __rcu *p_port; + struct net_bridge_port __rcu *s_port; + + u32 ring_id; + + enum br_mrp_ring_role_type ring_role; + u8 ring_role_offloaded; + enum br_mrp_ring_state_type ring_state; + u32 ring_transitions; + + struct delayed_work test_work; + u32 test_interval; + unsigned long test_end; + u32 test_count_miss; + u32 test_max_miss; + + u32 seq_id; + + struct rcu_head rcu; +}; + +/* br_mrp.c */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_set_port_role(struct net_bridge_port *p, + struct br_mrp_port_role *role); +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state); +int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); +int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); + +/* br_mrp_switchdev.c */ +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_state_type state); +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role); + +/* br_mrp_netlink.c */ +int br_mrp_port_open(struct net_device *dev, u8 loc); + +#endif /* _BR_PRIVATE_MRP_H */ -- cgit v1.2.3 From fadd409136f0f21192d80816edd9529f27d88c17 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:04 +0200 Subject: bridge: switchdev: mrp: Implement MRP API for switchdev Implement the MRP API for switchdev. These functions eventually just call the switchdev functions switchdev_port_obj_add/del and switchdev_port_attr_set, as sketched below.
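As a hedged driver-side sketch (hypothetical driver; the ex_* names are invented and the callback signature is simplified relative to the real switchdev notifier plumbing), a driver would accept the MRP object emitted by br_mrp_switchdev_add() roughly like this:

/* Hypothetical sketch: dispatching SWITCHDEV_OBJ_ID_MRP in a driver's
 * SWITCHDEV_PORT_OBJ_ADD handling. ex_hw_mrp_ring_add() is invented.
 */
static int ex_hw_mrp_ring_add(struct net_device *dev,
			      const struct switchdev_obj_mrp *mrp);

static int ex_port_obj_add(struct net_device *dev,
			   const struct switchdev_obj *obj)
{
	switch (obj->id) {
	case SWITCHDEV_OBJ_ID_MRP:
		/* program ring ->ring_id on ->p_port/->s_port in HW;
		 * returning -EOPNOTSUPP makes the bridge fall back to SW
		 */
		return ex_hw_mrp_ring_add(dev,
					  container_of(obj,
						       struct switchdev_obj_mrp,
						       obj));
	default:
		return -EOPNOTSUPP;
	}
}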
Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 + net/bridge/br_mrp_switchdev.c | 140 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 net/bridge/br_mrp_switchdev.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 49da7ae6f077..3cacf9dd78d5 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -25,3 +25,5 @@ bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_opt bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ + +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c new file mode 100644 index 000000000000..51cb1d5a24b4 --- /dev/null +++ b/net/bridge/br_mrp_switchdev.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/switchdev.h> + +#include "br_private_mrp.h" + +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = rtnl_dereference(mrp->p_port)->dev, + .s_port = rtnl_dereference(mrp->s_port)->dev, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = NULL, + .s_port = NULL, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_role_type role) +{ + struct switchdev_obj_ring_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + .ring_role = role, + .ring_id = mrp->ring_id, + }; + int err; + + if (role == BR_MRP_RING_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, + struct br_mrp *mrp, u32 interval, + u8 max_miss, u32 period) +{ + struct switchdev_obj_ring_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .ring_id = mrp->ring_id, + .period = period, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_state_type state) +{ + struct switchdev_obj_ring_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP, + .ring_state = state, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + .u.mrp_port_state = state, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + br_warn(p->br, "error setting offload MRP state on port
%u(%s)\n", + (unsigned int)p->port_no, p->dev->name); + + return err; +} + +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + .u.mrp_port_role = role, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} -- cgit v1.2.3 From 9a9f26e8f7ea300e8efffcae036dbef239be433a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:05 +0200 Subject: bridge: mrp: Connect MRP API with the switchdev API Implement the MRP API. In case the HW can't generate MRP Test frames then the SW will try to generate the frames. In case that also the SW will fail in generating the frames then a error is return to the userspace. The userspace is responsible to generate all the other MRP frames regardless if the test frames are generated by HW or SW. The forwarding/termination of MRP frames is happening in the kernel and is done by the MRP instance. The userspace application doesn't do the forwarding. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br_mrp.c | 559 ++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_mrp_netlink.c | 29 +++ 3 files changed, 589 insertions(+), 1 deletion(-) create mode 100644 net/bridge/br_mrp.c create mode 100644 net/bridge/br_mrp_netlink.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 3cacf9dd78d5..ccb394236fbd 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -26,4 +26,4 @@ bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ -bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c new file mode 100644 index 000000000000..d7bc09de4c13 --- /dev/null +++ b/net/bridge/br_mrp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "br_private_mrp.h" + +static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; + +static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *res = NULL; + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) { + if (port->dev->ifindex == ifindex) { + res = port; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->ring_id == ring_id) { + res = mrp; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (rcu_access_pointer(mrp->p_port) == p || + rcu_access_pointer(mrp->s_port) == p) { + res = mrp; + break; + } + } + + return res; +} + +static int br_mrp_next_seq(struct br_mrp *mrp) +{ + mrp->seq_id++; + return mrp->seq_id; +} + +static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, + const u8 *src, const u8 *dst) +{ + struct ethhdr *eth_hdr; + struct sk_buff *skb; + u16 *version; + + skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + skb->dev = p->dev; + skb->protocol = 
htons(ETH_P_MRP); + skb->priority = MRP_FRAME_PRIO; + skb_reserve(skb, sizeof(*eth_hdr)); + + eth_hdr = skb_push(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, dst); + ether_addr_copy(eth_hdr->h_source, src); + eth_hdr->h_proto = htons(ETH_P_MRP); + + version = skb_put(skb, sizeof(*version)); + *version = cpu_to_be16(MRP_VERSION); + + return skb; +} + +static void br_mrp_skb_tlv(struct sk_buff *skb, + enum br_mrp_tlv_header_type type, + u8 length) +{ + struct br_mrp_tlv_hdr *hdr; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->type = type; + hdr->length = length; +} + +static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp) +{ + struct br_mrp_common_hdr *hdr; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr)); + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp)); + memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH); +} + +static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_ring_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->ring_state); + hdr->transitions = cpu_to_be16(mrp->ring_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + +static void br_mrp_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->test_end, jiffies)) + return; + + if (mrp->test_count_miss < mrp->test_max_miss) { + mrp->test_count_miss++; + } else { + /* Notify that the ring is open only if the ring state is + * closed, otherwise it would continue to notify at every + * interval. + */ + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(mrp->test_interval)); +} + +/* Deletes the MRP instance. 
+ * note: called under rtnl_lock + */ +static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) +{ + struct net_bridge_port *p; + + /* Stop sending MRP_Test frames */ + cancel_delayed_work_sync(&mrp->test_work); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + + br_mrp_switchdev_del(br, mrp); + + /* Reset the ports */ + p = rtnl_dereference(mrp->p_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->p_port, NULL); + } + + p = rtnl_dereference(mrp->s_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->s_port, NULL); + } + + list_del_rcu(&mrp->list); + kfree_rcu(mrp, rcu); +} + +/* Adds a new MRP instance. + * note: called under rtnl_lock + */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct net_bridge_port *p; + struct br_mrp *mrp; + int err; + + /* If the ring exists, it is not possible to create another one with the + * same ring_id + */ + mrp = br_mrp_find_id(br, instance->ring_id); + if (mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, instance->p_ifindex) || + !br_mrp_get_port(br, instance->s_ifindex)) + return -EINVAL; + + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); + if (!mrp) + return -ENOMEM; + + mrp->ring_id = instance->ring_id; + + p = br_mrp_get_port(br, instance->p_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->p_port, p); + + p = br_mrp_get_port(br, instance->s_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->s_port, p); + + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + list_add_tail_rcu(&mrp->list, &br->mrp_list); + + err = br_mrp_switchdev_add(br, mrp); + if (err) + goto delete_mrp; + + return 0; + +delete_mrp: + br_mrp_del_impl(br, mrp); + + return err; +} + +/* Deletes the MRP instance from which the port is part of + * note: called under rtnl_lock + */ +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) +{ + struct br_mrp *mrp = br_mrp_find_port(br, p); + + /* If the port is not part of a MRP instance just bail out */ + if (!mrp) + return; + + br_mrp_del_impl(br, mrp); +} + +/* Deletes existing MRP instance based on ring_id + * note: called under rtnl_lock + */ +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id); + + if (!mrp) + return -EINVAL; + + br_mrp_del_impl(br, mrp); + + return 0; +} + +/* Set port state, port state can be forwarding, blocked or disabled + * note: already called with rtnl_lock + */ +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + spin_lock_bh(&p->br->lock); + + if (state == BR_MRP_PORT_STATE_FORWARDING) + p->state = BR_STATE_FORWARDING; + else + p->state = BR_STATE_BLOCKING; + + spin_unlock_bh(&p->br->lock); + + br_mrp_port_switchdev_set_state(p, state); + + return 0; +} + +/* Set port role, port role can be primary or secondary + * note: already called with rtnl_lock + */ +int br_mrp_set_port_role(struct net_bridge_port *p, + 
struct br_mrp_port_role *role) +{ + struct br_mrp *mrp; + + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + mrp = br_mrp_find_id(p->br, role->ring_id); + + if (!mrp) + return -EINVAL; + + if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + rcu_assign_pointer(mrp->p_port, p); + else + rcu_assign_pointer(mrp->s_port, p); + + br_mrp_port_switchdev_set_role(p, role->role); + + return 0; +} + +/* Set the ring state; the ring state can only be Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state) +{ + struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id); + + if (!mrp) + return -EINVAL; + + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && + state->ring_state != BR_MRP_RING_STATE_CLOSED) + mrp->ring_transitions++; + + mrp->ring_state = state->ring_state; + + br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state); + + return 0; +} + +/* Set the ring role; the ring role can only be MRM (Media Redundancy Manager) + * or MRC (Media Redundancy Client). + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_role(struct net_bridge *br, + struct br_mrp_ring_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + int err; + + if (!mrp) + return -EINVAL; + + mrp->ring_role = role->ring_role; + + /* If there is an error, just bail out */ + err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role, the SW no longer needs to do those operations. For + * example, if the role is MRM then the HW will notify the SW when the + * ring is open, but if the role is not pushed to the HW then the SW will + * need to detect when the ring is open + */ + mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} + +/* Start generating MRP Test frames; the frames are generated by the HW and, + * if that fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_test(struct net_bridge *br, + struct br_mrp_start_test *test) +{ + struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + + if (!mrp) + return -EINVAL; + + /* Try to push it to the HW; if that fails, continue to generate the + * frames in SW, and if that also fails, return an error + */ + if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period)) + return 0; + + mrp->test_interval = test->interval; + mrp->test_end = jiffies + usecs_to_jiffies(test->period); + mrp->test_max_miss = test->max_miss; + mrp->test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(test->interval)); + + return 0; +} + +/* Process only MRP Test frames. All the other MRP frames are processed by the + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get the TLV header directly.
+ */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + mrp->test_count_miss = 0; + + /* Notify userspace that the ring is closed, but only if it was not + * already considered closed + */ + if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) + br_mrp_port_open(port->dev, false); +} + +/* This will just forward the frame to the other MRP ring port (MRC role) or + * will not do anything. + * note: already called with rcu_read_lock + */ +static int br_mrp_rcv(struct net_bridge_port *p, + struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *s_dev, *p_dev, *d_dev; + struct net_bridge_port *p_port, *s_port; + struct net_bridge *br; + struct sk_buff *nskb; + struct br_mrp *mrp; + + /* If the port is disabled, don't accept any frames */ + if (p->state == BR_STATE_DISABLED) + return 0; + + br = p->br; + mrp = br_mrp_find_port(br, p); + if (unlikely(!mrp)) + return 0; + + p_port = rcu_dereference(mrp->p_port); + if (!p_port) + return 0; + + s_port = rcu_dereference(mrp->s_port); + if (!s_port) + return 0; + + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + /* Clone the frame and forward it on the other MRP port */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return 0; + + p_dev = p_port->dev; + s_dev = s_port->dev; + + if (p_dev == dev) + d_dev = s_dev; + else + d_dev = p_dev; + + nskb->dev = d_dev; + skb_push(nskb, ETH_HLEN); + dev_queue_xmit(nskb); + + return 1; +} + +/* Check if the frame was received on a port that is part of an MRP ring + * and if the frame carries the MRP ethertype. In that case process the frame, + * otherwise do normal forwarding. + * note: already called with rcu_read_lock + */ +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + /* If there is no MRP instance, do normal forwarding */ + if (likely(!(p->flags & BR_MRP_AWARE))) + goto out; + + if (unlikely(skb->protocol == htons(ETH_P_MRP))) + return br_mrp_rcv(p, skb, p->dev); + +out: + return 0; +} + +bool br_mrp_enabled(struct net_bridge *br) +{ + return !list_empty(&br->mrp_list); +} diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c new file mode 100644 index 000000000000..b982db14bbf4 --- /dev/null +++ b/net/bridge/br_mrp_netlink.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include +#include "br_private.h" +#include "br_private_mrp.h" + +int br_mrp_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_CONT; + else + p->flags &= ~BR_MRP_LOST_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} -- cgit v1.2.3 From 4d02b8f075153508562803e590f76c4dfe5f4b66 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:06 +0200 Subject: bridge: mrp: Implement netlink interface to configure MRP Implement the netlink interface to configure MRP. The implementation does sanity checks on the attributes and then eventually calls into the MRP interface. Signed-off-by: Horatiu Vultur Signed-off-by: David S.
Miller --- net/bridge/br_mrp_netlink.c | 91 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index b982db14bbf4..503896638be0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -6,6 +6,97 @@ #include "br_private.h" #include "br_private_mrp.h" +static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { + [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_instance)}, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_port_role)}, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_state)}, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_role)}, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_start_test)}, +}; + +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; + int err; + + if (br->stp_enabled != BR_NO_STP) { + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr, + br_mrp_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { + struct br_mrp_instance *instance = + nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); + + if (cmd == RTM_SETLINK) + err = br_mrp_add(br, instance); + else + err = br_mrp_del(br, instance); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { + enum br_mrp_port_state_type state = + nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); + + err = br_mrp_set_port_state(p, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { + struct br_mrp_port_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); + + err = br_mrp_set_port_role(p, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { + struct br_mrp_ring_state *state = + nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); + + err = br_mrp_set_ring_state(br, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { + struct br_mrp_ring_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); + + err = br_mrp_set_ring_role(br, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_TEST]) { + struct br_mrp_start_test *test = + nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); + + err = br_mrp_start_test(br, test); + if (err) + return err; + } + + return 0; +} + int br_mrp_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; -- cgit v1.2.3 From 6536993371fab3de4e8379649b60e94d03e6ff37 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:07 +0200 Subject: bridge: mrp: Integrate MRP into the bridge To integrate MRP into the bridge, the bridge needs to do the following: - detect if an MRP frame was received on an MRP ring port; in that case it is processed, otherwise it is forwarded as usual. - enable parsing of MRP - before, whenever the bridge was set up, it would set all the ports in the forwarding state. Add an extra check to not set ports in the forwarding state if the port is an MRP ring port.
The reason for this change is that if the MRP instance initially sets the port in the blocked state, bringing the bridge up would otherwise overwrite this setting. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 +++ net/bridge/br_if.c | 2 ++ net/bridge/br_input.c | 3 +++ net/bridge/br_netlink.c | 5 +++++ net/bridge/br_private.h | 31 +++++++++++++++++++++++++++++++ 5 files changed, 44 insertions(+) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0e3dbc5f3c34..8ec1362588af 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -463,6 +463,9 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + INIT_LIST_HEAD(&br->mrp_list); +#endif spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4fe30b182ee7..ca685c0cdf95 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -333,6 +333,8 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); + br_mrp_port_del(br, p); + br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fcc260840028..d5c34f36f0f4 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -342,6 +342,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } + if (unlikely(br_mrp_process(p, skb))) + return RX_HANDLER_PASS; + forward: switch (p->state) { case BR_STATE_FORWARDING: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4084f1ef8641..1a5e681a626a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -672,6 +672,11 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MRP: + err = br_mrp_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 835a70f8d3ea..5835828320b6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1308,6 +1308,37 @@ unsigned long br_timer_value(const struct timer_list *timer); extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif +/* br_mrp.c */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); +bool br_mrp_enabled(struct net_bridge *br); +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +#else +static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + return 0; +} + +static inline bool br_mrp_enabled(struct net_bridge *br) +{ + return false; +} + +static inline void br_mrp_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} +#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); -- cgit v1.2.3 From 419dba8a49d7cc355e5b495d20dea8217369ed63 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:08 +0200 Subject: net: bridge: Add checks for enabling the STP.
It is not possible to have the MRP and STP running at the same time on the bridge, therefore add check when enabling the STP to check if MRP is already enabled. In that case return error. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_ioctl.c | 3 +-- net/bridge/br_netlink.c | 4 +++- net/bridge/br_private.h | 3 ++- net/bridge/br_stp.c | 6 ++++++ net/bridge/br_stp_if.c | 11 ++++++++++- net/bridge/br_sysfs_br.c | 4 +--- 6 files changed, 23 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index ae22d784b88a..5e71fc8b826f 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -242,8 +242,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br_stp_set_enabled(br, args[1]); - ret = 0; + ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 1a5e681a626a..a774e19c41bb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1109,7 +1109,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); - br_stp_set_enabled(br, stp_enabled); + err = br_stp_set_enabled(br, stp_enabled, extack); + if (err) + return err; } if (data[IFLA_BR_PRIORITY]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5835828320b6..c35647cb138a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1283,7 +1283,8 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); -void br_stp_set_enabled(struct net_bridge *br, unsigned long val); +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f14b8455345..3e88be7aa269 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) }; int err; + /* Don't change the state of the ports if they are driven by a different + * protocol. 
+ */ + if (p->flags & BR_MRP_AWARE) + return; + p->state = state; err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index d174d3a566aa..a42850b7eb9a 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -196,10 +196,17 @@ static void br_stp_stop(struct net_bridge *br) br->stp_enabled = BR_NO_STP; } -void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { ASSERT_RTNL(); + if (br_mrp_enabled(br)) { + NL_SET_ERR_MSG_MOD(extack, + "STP can't be enabled if MRP is already enabled\n"); + return -EINVAL; + } + if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); @@ -207,6 +214,8 @@ void br_stp_set_enabled(struct net_bridge *br, unsigned long val) if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } + + return 0; } /* called under bridge lock */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 9ab0f00b1081..7db06e3f642a 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -126,9 +126,7 @@ static ssize_t stp_state_show(struct device *d, static int set_stp_state(struct net_bridge *br, unsigned long val) { - br_stp_set_enabled(br, val); - - return 0; + return br_stp_set_enabled(br, val, NULL); } static ssize_t stp_state_store(struct device *d, -- cgit v1.2.3 From fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:33 +0200 Subject: netfilter: nf_tables: allow up to 64 bytes in the set element data area So far, the set elements could store up to 128-bits in the data area. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4ff7c81e6717..d4e29c952c40 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -243,6 +243,10 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } data; void *priv; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9780bd93b7e4..3558e76e2733 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4669,6 +4669,25 @@ static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, return 0; } +static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data_desc *desc, + struct nft_data *data, + struct nlattr *attr) +{ + int err; + + err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); + if (err < 0) + return err; + + if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + nft_data_release(data, desc->type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -4946,7 +4965,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; - struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; u32 flags = 0; @@ -5072,15 +5090,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if 
(nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &desc, - nla[NFTA_SET_ELEM_DATA]); + err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, + nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err_parse_key_end; - err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) - goto err_parse_data; - dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { @@ -5094,14 +5108,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, continue; err = nft_validate_register_store(&bind_ctx, dreg, - &data, + &elem.data.val, desc.type, desc.len); if (err < 0) goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && - (data.verdict.code == NFT_GOTO || - data.verdict.code == NFT_JUMP)) + (elem.data.val.verdict.code == NFT_GOTO || + elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } @@ -5123,7 +5137,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, - elem.key_end.val.data, data.data, + elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err_parse_data; @@ -5201,7 +5215,7 @@ err_trans: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, desc.type); + nft_data_release(&elem.data.val, desc.type); err_parse_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: -- cgit v1.2.3 From 0d7c83463fdf7841350f37960a7abadd3e650b41 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:34 +0200 Subject: netfilter: nft_nat: return EOPNOTSUPP if type or flags are not supported Instead of EINVAL which should be used for malformed netlink messages. Fixes: eb31628e37a0 ("netfilter: nf_tables: Add support for IPv6 NAT") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 8b44a4de5329..bb49a217635e 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -129,7 +129,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->type = NF_NAT_MANIP_DST; break; default: - return -EINVAL; + return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) @@ -196,7 +196,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (tb[NFTA_NAT_FLAGS]) { priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + return -EOPNOTSUPP; } return nf_ct_netns_get(ctx->net, family); -- cgit v1.2.3 From 4566aa440008103c9bd364f38c03ca5309acc8f4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:35 +0200 Subject: netfilter: nft_nat: set flags from initialization path This patch sets the NAT flags from the control plane path. 
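As a quick illustration of the control-plane/datapath split this establishes (a sketch only, not code from the patch; the *_sketch names are invented), the flag bits depend solely on which netlink attributes were present at rule creation, so they can be composed once at init time and copied verbatim for every packet:

static void nat_init_sketch(struct nft_nat *priv, bool has_addr, bool has_proto)
{
	/* control plane: runs once, when the rule is created */
	if (has_addr)
		priv->flags |= NF_NAT_RANGE_MAP_IPS;
	if (has_proto)
		priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}

static void nat_eval_sketch(const struct nft_nat *priv, struct nf_nat_range2 *range)
{
	/* datapath: runs per packet, no flag recomputation */
	range->flags = priv->flags;
}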
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index bb49a217635e..5c7ff213c030 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -55,7 +55,6 @@ static void nft_nat_eval(const struct nft_expr *expr, &regs->data[priv->sreg_addr_max], sizeof(range.max_addr.ip6)); } - range.flags |= NF_NAT_RANGE_MAP_IPS; } if (priv->sreg_proto_min) { @@ -63,10 +62,9 @@ static void nft_nat_eval(const struct nft_expr *expr, &regs->data[priv->sreg_proto_min]); range.max_proto.all = (__force __be16)nft_reg_load16( &regs->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - range.flags |= priv->flags; + range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } @@ -169,6 +167,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_addr_max = priv->sreg_addr_min; } + + priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_addr.all); @@ -191,10 +191,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) { - priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) return -EOPNOTSUPP; } -- cgit v1.2.3 From acd766e31bb96b90c2dc4954f86e573c9ac16c66 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:36 +0200 Subject: netfilter: nft_nat: add helper function to set up NAT address and protocol This patch adds nft_nat_setup_addr() and nft_nat_setup_proto() to set up the NAT mangling.
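The refactor below is easiest to read through the eval path it produces. The following condensed sketch (paraphrased for this digest, not the literal kernel code) shows the resulting straight-line structure once the register loading has moved into the two helpers:

static void nft_nat_eval_sketch(const struct nft_nat *priv,
				struct nft_regs *regs,
				struct nf_nat_range2 *range)
{
	memset(range, 0, sizeof(*range));

	if (priv->sreg_addr_min)	/* an address range was configured */
		nft_nat_setup_addr(range, regs, priv);

	if (priv->sreg_proto_min)	/* a port range was configured */
		nft_nat_setup_proto(range, regs, priv);

	range->flags = priv->flags;	/* composed at init time, see above */
}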
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 56 ++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 5c7ff213c030..7442aa8b1555 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -30,6 +30,36 @@ struct nft_nat { u16 flags; }; +static void nft_nat_setup_addr(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + switch (priv->family) { + case AF_INET: + range->min_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_min]; + range->max_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_max]; + break; + case AF_INET6: + memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], + sizeof(range->min_addr.ip6)); + memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], + sizeof(range->max_addr.ip6)); + break; + } +} + +static void nft_nat_setup_proto(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + range->min_proto.all = (__force __be16) + nft_reg_load16(&regs->data[priv->sreg_proto_min]); + range->max_proto.all = (__force __be16) + nft_reg_load16(&regs->data[priv->sreg_proto_max]); +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -40,29 +70,11 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) { - if (priv->family == AF_INET) { - range.min_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_min]; - range.max_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_max]; - - } else { - memcpy(range.min_addr.ip6, - &regs->data[priv->sreg_addr_min], - sizeof(range.min_addr.ip6)); - memcpy(range.max_addr.ip6, - &regs->data[priv->sreg_addr_max], - sizeof(range.max_addr.ip6)); - } - } + if (priv->sreg_addr_min) + nft_nat_setup_addr(&range, regs, priv); - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - &regs->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - &regs->data[priv->sreg_proto_max]); - } + if (priv->sreg_proto_min) + nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; -- cgit v1.2.3 From 3ff7ddb1353da9b535e65702704cbadea1da9a00 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:37 +0200 Subject: netfilter: nft_nat: add netmap support This patch allows you to NAT the network address prefix onto another network address prefix, a.k.a. netmapping. Userspace must specify the NF_NAT_RANGE_NETMAP flag and the prefix address through the NFTA_NAT_REG_ADDR_MIN and NFTA_NAT_REG_ADDR_MAX netlink attributes.
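A small standalone program (illustrative addresses from the RFC 5737 documentation ranges) shows the arithmetic the netmap rewrite performs: the bits on which the configured min and max addresses agree become the new prefix, and the remaining host bits are copied from the packet address:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t orig = 0xc0000220;	/* 192.0.2.32, address taken from the packet */
	uint32_t min  = 0xc6336400;	/* 198.51.100.0, configured range minimum */
	uint32_t max  = 0xc63364ff;	/* 198.51.100.255, configured range maximum */

	/* Bits where min and max agree form the replacement prefix */
	uint32_t netmask = ~(min ^ max);		/* 0xffffff00 */
	uint32_t mapped  = (orig & ~netmask)		/* keep the host bits */
			 | (min & netmask);		/* swap in the prefix */

	printf("mapped = 0x%08x\n", mapped);	/* 0xc6336420, i.e. 198.51.100.32 */
	return 0;
}

The kernel code in the diff below does the same computation per 32-bit word, which lets one loop cover both IPv4 and IPv6 prefixes.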
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 4 ++- net/netfilter/nft_nat.c | 46 ++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 4a95c0db14d4..a64586e77b24 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -11,6 +11,7 @@ #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) #define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) +#define NF_NAT_RANGE_NETMAP (1 << 6) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -18,7 +19,8 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ + NF_NAT_RANGE_NETMAP) struct nf_nat_ipv4_range { unsigned int flags; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 7442aa8b1555..23a7bfd10521 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -60,6 +60,46 @@ static void nft_nat_setup_proto(struct nf_nat_range2 *range, nft_reg_load16(®s->data[priv->sreg_proto_max]); } +static void nft_nat_setup_netmap(struct nf_nat_range2 *range, + const struct nft_pktinfo *pkt, + const struct nft_nat *priv) +{ + struct sk_buff *skb = pkt->skb; + union nf_inet_addr new_addr; + __be32 netmask; + int i, len = 0; + + switch (priv->type) { + case NFT_NAT_SNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->saddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->saddr; + len = sizeof(struct in6_addr); + } + break; + case NFT_NAT_DNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->daddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->daddr; + len = sizeof(struct in6_addr); + } + break; + } + + for (i = 0; i < len / sizeof(__be32); i++) { + netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); + new_addr.ip6[i] &= ~netmask; + new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; + } + + range->min_addr = new_addr; + range->max_addr = new_addr; +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -70,8 +110,12 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) + + if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); + if (priv->flags & NF_NAT_RANGE_NETMAP) + nft_nat_setup_netmap(&range, pkt, priv); + } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); -- cgit v1.2.3 From 0146dca70b877b73c5fd9c67912b8a0ca8a7bac7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 27 Apr 2020 17:59:34 +0200 Subject: xfrm: add support for UDPv6 encapsulation of ESP This patch adds support for encapsulation of ESP over UDPv6. The code is very similar to the IPv4 encapsulation implementation, and allows to easily add espintcp on IPv6 as a follow-up. 
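From userspace the IPv6 case is driven exactly like the IPv4 one: the IKE daemon marks its NAT-traversal socket for ESP-in-UDP decapsulation with the UDP_ENCAP socket option, now on an AF_INET6 socket. A minimal sketch (error handling omitted; the function name and port choice are illustrative):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/udp.h>	/* UDP_ENCAP, UDP_ENCAP_ESPINUDP */

int natt_socket_v6(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	int encap = UDP_ENCAP_ESPINUDP;
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_port = htons(4500);	/* customary IKE NAT-T port */
	bind(fd, (struct sockaddr *)&sin6, sizeof(sin6));

	/* With this patch, the kernel installs xfrm6_udp_encap_rcv
	 * (via the ipv6 stub) instead of the IPv4 handler when the
	 * socket family is AF_INET6.
	 */
	setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap));
	return fd;
}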
Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 3 + include/net/xfrm.h | 5 + net/ipv4/udp.c | 10 +- net/ipv6/af_inet6.c | 4 + net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 226 +++++++++++++++++++++++++++++++++++++++++----- net/ipv6/esp6_offload.c | 7 +- net/ipv6/ip6_vti.c | 18 +++- net/ipv6/ipcomp6.c | 1 + net/ipv6/xfrm6_input.c | 106 +++++++++++++++++++++- net/ipv6/xfrm6_protocol.c | 48 ++++++++++ net/xfrm/xfrm_interface.c | 3 + 12 files changed, 395 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..f033a17b53b6 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -56,6 +56,9 @@ struct ipv6_stub { void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); +#if IS_ENABLED(CONFIG_XFRM) + int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); +#endif struct neigh_table *nd_tbl; }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8f71c111e65a..2577666c34c8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1406,6 +1406,8 @@ struct xfrm4_protocol { struct xfrm6_protocol { int (*handler)(struct sk_buff *skb); + int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); @@ -1590,6 +1592,8 @@ int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); int xfrm6_transport_finish(struct sk_buff *skb, int async); int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); @@ -1610,6 +1614,7 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, #ifdef CONFIG_XFRM int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen); #else diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32564b350823..1b7ebbcae497 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -112,6 +112,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -2563,7 +2566,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: - up->encap_rcv = xfrm4_udp_encap_rcv; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv; + else +#endif + up->encap_rcv = xfrm4_udp_encap_rcv; #endif fallthrough; case UDP_ENCAP_L2TPINUDP: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..b0b99c08350a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -961,6 +962,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, +#if IS_ENABLED(CONFIG_XFRM) + .xfrm6_udp_encap_rcv = 
xfrm6_udp_encap_rcv, +#endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 45e2adc56610..d88d97617f7e 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -767,6 +767,7 @@ static const struct xfrm_type ah6_type = { static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, .priority = 0, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 11143d039f16..e8800968e209 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -26,10 +26,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -39,6 +41,11 @@ struct esp_skb_cb { void *tmp; }; +struct esp_output_extra { + __be32 seqhi; + u32 esphoff; +}; + #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) /* @@ -72,9 +79,9 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) return kmalloc(len, GFP_ATOMIC); } -static inline __be32 *esp_tmp_seqhi(void *tmp) +static inline void *esp_tmp_extra(void *tmp) { - return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); + return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra)); } static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) @@ -104,16 +111,17 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, static void esp_ssg_unref(struct xfrm_state *x, void *tmp) { + struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; - int seqhilen = 0; + int extralen = 0; u8 *iv; struct aead_request *req; struct scatterlist *sg; if (x->props.flags & XFRM_STATE_ESN) - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); - iv = esp_tmp_iv(aead, tmp, seqhilen); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); /* Unref skb_frag_pages in the src scatterlist if necessary. 
@@ -124,6 +132,23 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +static void esp_output_encap_csum(struct sk_buff *skb) +{ + /* UDP encap with IPv6 requires a valid checksum */ + if (*skb_mac_header(skb) == IPPROTO_UDP) { + struct udphdr *uh = udp_hdr(skb); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int len = ntohs(uh->len); + unsigned int offset = skb_transport_offset(skb); + __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0); + + uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -143,6 +168,8 @@ static void esp_output_done(struct crypto_async_request *base, int err) esp_ssg_unref(x, tmp); kfree(tmp); + esp_output_encap_csum(skb); + if (xo && (xo->flags & XFRM_DEV_RESUME)) { if (err) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); @@ -163,7 +190,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) { struct ip_esp_hdr *esph = (void *)(skb->data + offset); void *tmp = ESP_SKB_CB(skb)->tmp; - __be32 *seqhi = esp_tmp_seqhi(tmp); + __be32 *seqhi = esp_tmp_extra(tmp); esph->seq_no = esph->spi; esph->spi = *seqhi; @@ -171,27 +198,36 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) static void esp_output_restore_header(struct sk_buff *skb) { - esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); + void *tmp = ESP_SKB_CB(skb)->tmp; + struct esp_output_extra *extra = esp_tmp_extra(tmp); + + esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff - + sizeof(__be32)); } static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, struct xfrm_state *x, struct ip_esp_hdr *esph, - __be32 *seqhi) + struct esp_output_extra *extra) { /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. 
*/ if ((x->props.flags & XFRM_STATE_ESN)) { + __u32 seqhi; struct xfrm_offload *xo = xfrm_offload(skb); - esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); - *seqhi = esph->spi; if (xo) - esph->seq_no = htonl(xo->seq.hi); + seqhi = xo->seq.hi; else - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + seqhi = XFRM_SKB_CB(skb)->seq.output.hi; + + extra->esphoff = (unsigned char *)esph - + skb_transport_header(skb); + esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); + extra->seqhi = esph->spi; + esph->seq_no = htonl(seqhi); } esph->spi = x->id.spi; @@ -207,15 +243,84 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) +{ + struct udphdr *uh; + __be32 *udpdata32; + unsigned int len; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > U16_MAX) + return ERR_PTR(-EMSGSIZE); + + uh = (struct udphdr *)esp->esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(len); + uh->check = 0; + + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + } + + if (IS_ERR(esph)) + return PTR_ERR(esph); + + esp->esph = esph; + + return 0; +} + int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + if (x->encap) { + int err = esp6_output_encap(x, skb, esp); + + if (err < 0) + return err; + } + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; @@ -274,10 +379,13 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); @@ -295,20 +403,20 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info void *tmp; int ivlen; int assoclen; - int seqhilen; - __be32 *seqhi; + int extralen; struct page *page; struct ip_esp_hdr *esph; struct aead_request *req; struct crypto_aead *aead; struct scatterlist *sg, *dsg; + struct esp_output_extra *extra; int err = -ENOMEM; assoclen = sizeof(struct ip_esp_hdr); - seqhilen = 0; + extralen = 0; if (x->props.flags & XFRM_STATE_ESN) { - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); assoclen += sizeof(__be32); } @@ -316,12 +424,12 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info alen = 
crypto_aead_authsize(aead); ivlen = crypto_aead_ivsize(aead); - tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen); + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen); if (!tmp) goto error; - seqhi = esp_tmp_seqhi(tmp); - iv = esp_tmp_iv(aead, tmp, seqhilen); + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -330,7 +438,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info else dsg = &sg[esp->nfrags]; - esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi); + esph = esp_output_set_esn(skb, x, esp->esph, extra); + esp->esph = esph; sg_init_table(sg, esp->nfrags); err = skb_to_sgvec(skb, sg, @@ -394,6 +503,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info case 0: if ((x->props.flags & XFRM_STATE_ESN)) esp_output_restore_header(skb); + esp_output_encap_csum(skb); } if (sg != dsg) @@ -438,11 +548,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; + esp.esph = ip_esp_hdr(skb); + esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; - esph = ip_esp_hdr(skb); + esph = esp.esph; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); @@ -517,6 +629,56 @@ int esp6_input_done2(struct sk_buff *skb, int err) if (unlikely(err < 0)) goto out; + if (x->encap) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + __be16 source; + + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } + + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) || + source != encap->encap_sport) { + xfrm_address_t ipaddr; + + memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6)); + km_new_mapping(x, &ipaddr, source); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. 
+ */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (x->props.mode == XFRM_MODE_TRANSPORT) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); @@ -632,7 +794,7 @@ skip_cow: goto out; ESP_SKB_CB(skb)->tmp = tmp; - seqhi = esp_tmp_seqhi(tmp); + seqhi = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -836,9 +998,6 @@ static int esp6_init_state(struct xfrm_state *x) u32 align; int err; - if (x->encap) - return -EINVAL; - x->data = NULL; if (x->aead) @@ -867,6 +1026,22 @@ static int esp6_init_state(struct xfrm_state *x) break; } + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + err = -EINVAL; + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; + } + } + align = ALIGN(crypto_aead_blocksize(aead), 4); x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); @@ -893,6 +1068,7 @@ static const struct xfrm_type esp6_type = { static struct xfrm6_protocol esp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, .priority = 0, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 8eab2c869d61..06163cc15844 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -271,7 +271,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features int alen; int blksize; struct xfrm_offload *xo; - struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; @@ -312,13 +311,13 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features seq = xo->seq.low; - esph = ip_esp_hdr(skb); - esph->spi = x->id.spi; + esp.esph = ip_esp_hdr(skb); + esp.esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(seq); + esp.esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index cc6180e08a4f..1147f647b9a0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -296,7 +296,8 @@ static void vti6_dev_uninit(struct net_device *dev) dev_put(dev); } -static int vti6_rcv(struct sk_buff *skb) +static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -323,7 +324,10 @@ static int vti6_rcv(struct sk_buff *skb) rcu_read_unlock(); - return xfrm6_rcv_tnl(skb, t); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; @@ -332,6 +336,13 @@ discard: return 0; } +static int vti6_rcv(struct sk_buff *skb) +{ + int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; + + return vti6_input_proto(skb, nexthdr, 0, 0); +} + static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -1185,6 +1196,7 @@ static struct pernet_operations vti6_net_ops = { static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { 
.handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1192,6 +1204,7 @@ static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1199,6 +1212,7 @@ static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 3752bd3e92ce..99668bfebd85 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -183,6 +183,7 @@ static const struct xfrm_type ipcomp6_type = { static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, .priority = 0, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index a52cb3fc6df5..56f52353b324 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -35,9 +35,12 @@ EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (xfrm_trans_queue(skb, ip6_rcv_finish)) - __kfree_skb(skb); - return -1; + if (xfrm_trans_queue(skb, ip6_rcv_finish)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return 0; } int xfrm6_transport_finish(struct sk_buff *skb, int async) @@ -60,13 +63,106 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); skb_reset_transport_header(skb); - return -1; + return 0; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm6_transport_finish2); - return -1; + return 0; +} + +/* If it's a keepalive packet, then just eat it. + * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input. + * Returns 0 if skb passed to xfrm or was dropped. + * Returns >0 if skb should be passed to UDP. + * Returns <0 if skb should be resubmitted (-ret is protocol) + */ +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh; + struct ipv6hdr *ip6h; + int len; + int ip6hlen = sizeof(struct ipv6hdr); + + __u8 *udpdata; + __be32 *udpdata32; + __u16 encap_type = up->encap_type; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + len = skb->len - sizeof(struct udphdr); + if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) + return 1; + + /* Now we can get the pointers */ + uh = udp_hdr(skb); + udpdata = (__u8 *)uh + sizeof(struct udphdr); + udpdata32 = (__be32 *)udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. 
*/ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + if (skb_unclone(skb, GFP_ATOMIC)) + goto drop; + + /* Now we can update and verify the packet length... */ + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); + if (skb->len < ip6hlen + len) { + /* packet is too small!?! */ + goto drop; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + __skb_pull(skb, len); + skb_reset_transport_header(skb); + + /* process ESP */ + return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); + +drop: + kfree_skb(skb); + return 0; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index 34cb65c7d5a7..ea2f805d3b01 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,53 @@ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + + if (!head) + goto out; + + if (!skb_dst(skb)) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, + skb, flags); + if (dst->error) + goto drop; + skb_dst_set(skb, dst); + } + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_encap); + static int xfrm6_esp_rcv(struct sk_buff *skb) { int ret; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index eb9928c0a87c..02f8f46d0cc5 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -755,6 +755,7 @@ static struct pernet_operations xfrmi_net_ops = { static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -762,6 +763,7 @@ static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -769,6 +771,7 @@ static 
struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, -- cgit v1.2.3 From 26333c37fc285e7372f1b9461f3ae0ba3dc699c9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 27 Apr 2020 17:59:35 +0200 Subject: xfrm: add IPv6 support for espintcp This extends espintcp to support IPv6, building on the existing code and the new UDPv6 encapsulation support. Most of the code is either reused directly (stream parser, ULP) or very similar to the IPv4 variant (net/ipv6/esp6.c changes). The separation of config options for IPv4 and IPv6 espintcp requires a bit of Kconfig gymnastics to enable the core code. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 2 + net/ipv4/Kconfig | 1 + net/ipv6/Kconfig | 12 +++ net/ipv6/af_inet6.c | 1 + net/ipv6/esp6.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/Kconfig | 3 + net/xfrm/Makefile | 2 +- net/xfrm/espintcp.c | 56 +++++++++++--- 8 files changed, 252 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index f033a17b53b6..1e9e0cf7dc75 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -58,6 +58,8 @@ struct ipv6_stub { bool router, bool solicited, bool override, bool inc_opt); #if IS_ENABLED(CONFIG_XFRM) int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); + int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type); #endif struct neigh_table *nd_tbl; }; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..014aaa17dc79 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -384,6 +384,7 @@ config INET_ESPINTCP depends on XFRM && INET_ESP select STREAM_PARSER select NET_SOCK_MSG + select XFRM_ESPINTCP help Support for RFC 8229 encapsulation of ESP and IKE over TCP/IPv4 sockets. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..468a2faadc7d 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -88,6 +88,18 @@ config INET6_ESP_OFFLOAD If unsure, say N. +config INET6_ESPINTCP + bool "IPv6: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET6_ESP + select STREAM_PARSER + select NET_SOCK_MSG + select XFRM_ESPINTCP + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv6 sockets. + + If unsure, say N. 
+ config INET6_IPCOMP tristate "IPv6: IPComp transformation" select INET6_XFRM_TUNNEL diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b0b99c08350a..cbbb00bad20e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -964,6 +964,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, + .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index e8800968e209..c43592771126 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include @@ -132,6 +135,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET6_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = __inet6_lookup_established(xs_net(x), &tcp_hashinfo, &x->id.daddr.in6, + dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp6_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. 
+ */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_encap_csum(struct sk_buff *skb) { /* UDP encap with IPv6 requires a valid checksum */ @@ -181,7 +310,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -274,6 +407,41 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, return (struct ip_esp_hdr *)(uh + 1); } +#ifdef CONFIG_INET6_ESPINTCP +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp6_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { @@ -294,6 +462,9 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; + case TCP_ENCAP_ESPINTCP: + esph = esp6_output_tcp_encap(x, skb, esp); + break; } if (IS_ERR(esph)) @@ -509,6 +680,9 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -633,9 +807,13 @@ int esp6_input_done2(struct sk_buff *skb, int err) const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct xfrm_encap_tmpl *encap = x->encap; struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + struct tcphdr *th = (void *)(skb_network_header(skb) + hdr_len); __be16 source; switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; @@ -1039,6 +1217,14 @@ static int esp6_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET6_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 6921a18201a0..b7fd9c838416 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -99,4 +99,7 @@ config NET_KEY_MIGRATE If unsure, say N. 
+config XFRM_ESPINTCP + bool + endif # INET diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 212a4fcb4a88..2d4bb4b9f75e 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,4 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o -obj-$(CONFIG_INET_ESPINTCP) += espintcp.o +obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 037ea156d2f9..2132a3b6df0f 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -6,6 +6,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -31,7 +34,12 @@ static void handle_esp(struct sk_buff *skb, struct sock *sk) rcu_read_lock(); skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); local_bh_disable(); - xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + ipv6_stub->xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + else +#endif + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); local_bh_enable(); rcu_read_unlock(); } @@ -347,6 +355,9 @@ unlock: static struct proto espintcp_prot __ro_after_init; static struct proto_ops espintcp_ops __ro_after_init; +static struct proto espintcp6_prot; +static struct proto_ops espintcp6_ops; +static DEFINE_MUTEX(tcpv6_prot_mutex); static void espintcp_data_ready(struct sock *sk) { @@ -384,10 +395,14 @@ static void espintcp_destruct(struct sock *sk) bool tcp_is_ulp_esp(struct sock *sk) { - return sk->sk_prot == &espintcp_prot; + return sk->sk_prot == &espintcp_prot || sk->sk_prot == &espintcp6_prot; } EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops); static int espintcp_init_sk(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -415,8 +430,19 @@ static int espintcp_init_sk(struct sock *sk) strp_check_rcv(&ctx->strp); skb_queue_head_init(&ctx->ike_queue); skb_queue_head_init(&ctx->out_queue); - sk->sk_prot = &espintcp_prot; - sk->sk_socket->ops = &espintcp_ops; + + if (sk->sk_family == AF_INET) { + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + } else { + mutex_lock(&tcpv6_prot_mutex); + if (!espintcp6_prot.recvmsg) + build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops); + mutex_unlock(&tcpv6_prot_mutex); + + sk->sk_prot = &espintcp6_prot; + sk->sk_socket->ops = &espintcp6_ops; + } ctx->saved_data_ready = sk->sk_data_ready; ctx->saved_write_space = sk->sk_write_space; sk->sk_data_ready = espintcp_data_ready; @@ -489,6 +515,20 @@ static __poll_t espintcp_poll(struct file *file, struct socket *sock, return mask; } +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops) +{ + memcpy(espintcp_prot, orig_prot, sizeof(struct proto)); + memcpy(espintcp_ops, orig_ops, sizeof(struct proto_ops)); + espintcp_prot->sendmsg = espintcp_sendmsg; + espintcp_prot->recvmsg = espintcp_recvmsg; + espintcp_prot->close = espintcp_close; + espintcp_prot->release_cb = espintcp_release; + espintcp_ops->poll = espintcp_poll; +} + static struct tcp_ulp_ops espintcp_ulp __read_mostly = { .name = "espintcp", .owner = THIS_MODULE, @@ -497,13 +537,7 
@@ static struct tcp_ulp_ops espintcp_ulp __read_mostly = { void __init espintcp_init(void) { - memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); - memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); - espintcp_prot.sendmsg = espintcp_sendmsg; - espintcp_prot.recvmsg = espintcp_recvmsg; - espintcp_prot.close = espintcp_close; - espintcp_prot.release_cb = espintcp_release; - espintcp_ops.poll = espintcp_poll; + build_protos(&espintcp_prot, &espintcp_ops, &tcp_prot, &inet_stream_ops); tcp_register_ulp(&espintcp_ulp); } -- cgit v1.2.3 From 4364f2e91f0d44fa0e233d2a55e3ec35053d9bd9 Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Thu, 23 Apr 2020 14:43:29 +0000 Subject: Bluetooth: allow scatternet connections if supported. This change allows scatternet connections to be created if the controller reports support and the HCI_QUIRK_VALID_LE_STATES indicates that the reported LE states can be trusted. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 966fc543c01d..006c24e04b44 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5288,7 +5288,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* Most controller will fail if we try to create new connections * while we have an existing one in slave role. */ - if (hdev->conn_hash.le_num_slave > 0) + if (hdev->conn_hash.le_num_slave > 0 && + (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || + !(hdev->le_states[3] & 0x10))) return NULL; /* If we're not connectable only connect devices that we have in -- cgit v1.2.3 From 11dd74b338bf83f8bca70b57bad33a903fedfa6e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:45 -0700 Subject: net: ipv6: new arg skip_notify to ip6_rt_del Used in subsequent work to skip route delete notifications on nexthop deletes. Suggested-by: David Ahern Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 2 +- include/net/ipv6_stubs.h | 2 +- net/ipv4/nexthop.c | 2 +- net/ipv6/addrconf.c | 12 ++++++------ net/ipv6/addrconf_core.c | 3 ++- net/ipv6/anycast.c | 4 ++-- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 11 +++++++---- 8 files changed, 21 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9947eb1e9eb6..e525f003e619 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -123,7 +123,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); -int ip6_del_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..a5f7c12c326a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -48,7 +48,7 @@ struct ipv6_stub { struct netlink_ext_ack *extack); void (*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); - int (*ip6_del_rt)(struct net *net, struct fib6_info *rt); + int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, struct nl_info *info); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fdfca534d094..9999687ad6dc 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,7 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i); + ipv6_stub->ip6_del_rt(net, f6i, false); } } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 27b4fb6e452b..2c4f20ec1e2a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1238,7 +1238,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { if (!(f6i->fib6_flags & RTF_EXPIRES)) fib6_set_expires(f6i, expires); @@ -2718,7 +2718,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3813,7 +3813,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -4652,7 +4652,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; if (f6i->fib6_metric != prio) { /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); /* add new one */ addrconf_prefix_route(modify_peer ? 
&ifp->peer_addr : &ifp->addr, @@ -6073,10 +6073,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0, false); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); } if (ifp->rt) { - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt, false); ifp->rt = NULL; } rt_genid_bump_ipv6(net); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..9ebf3fe0d2b1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } -static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, + bool skip_notify) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index fed91ab7ec46..893261230ffc 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; @@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..2d09c4da03ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..486c36a14f24 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -3729,9 +3729,12 @@ out: return err; } -int ip6_del_rt(struct net *net, struct fib6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { - struct nl_info info = { .nl_net = net }; + struct nl_info info = { + .nl_net = net, + .skip_notify = skip_notify + }; return __ip6_del_rt(rt, &info); } @@ -4252,7 +4255,7 @@ restart: (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); goto restart; } } -- cgit v1.2.3 From 4f80116d3df3b23ee4b83ea8557629e1799bc230 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:46 -0700 Subject: net: ipv4: add sysctl for nexthop api compatibility mode Current route nexthop API maintains user space compatibility with old route API by default. Dumps and netlink notifications support both new and old API format. In systems which have moved to the new API, this compatibility mode cancels some of the performance benefits provided by the new nexthop API. This patch adds new sysctl nexthop_compat_mode which is on by default but provides the ability to turn off compatibility mode allowing systems to run entirely with the new routing API. Old route API behaviour and support is not modified by this sysctl. Uses a single sysctl to cover both ipv4 and ipv6 following other sysctls. Covers dumps and delete notifications as suggested by David Ahern. 
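As a usage sketch (illustrative, not part of the patch): once every consumer on the system speaks the new API, compat mode can be switched off per network namespace through the sysctl added below, the equivalent of "sysctl -w net.ipv4.nexthop_compat_mode=0":

    /* Hypothetical helper: disable nexthop compatibility mode by writing
     * to the proc path exposed by the ipv4_net_table entry this patch adds.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/proc/sys/net/ipv4/nexthop_compat_mode", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, "0", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }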
Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 12 ++++++++++++ include/net/netns/ipv4.h | 2 ++ net/ipv4/af_inet.c | 1 + net/ipv4/fib_semantics.c | 3 +++ net/ipv4/nexthop.c | 5 +++-- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv6/route.c | 3 ++- 7 files changed, 32 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9375324aa8e1..5cdc37c34830 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1560,6 +1560,18 @@ skip_notify_on_dev_down - BOOLEAN on userspace caches to track link events and evict routes. Default: false (generate message) +nexthop_compat_mode - BOOLEAN + The new nexthop API provides a means for managing nexthops independent of + prefixes. Backwards compatibility with the old route format is enabled by + default which means route dumps and notifications contain the new + nexthop attribute but also the full, expanded nexthop definition. + Further, updates or deletes of a nexthop configuration generate route + notifications for each fib entry using the nexthop. Once a system + understands the new API, this sysctl can be disabled to achieve full + performance benefits of the new API by disabling the nexthop expansion + and extraneous notifications. + Default: true (backward compat mode) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 154b8f01499b..5acdb4d414c4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,8 @@ struct netns_ipv4 { int sysctl_tcp_early_demux; int sysctl_udp_early_demux; + int sysctl_nexthop_compat_mode; + int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c618e242490f..6177c4ba0037 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; + net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 55ca2e521828..e53871e4a097 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; + if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode) + goto offload; } if (nhs == 1) { @@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } +offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 9999687ad6dc..3957364d556c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i, false); + ipv6_stub->ip6_del_rt(net, f6i, + !net->ipv4.sysctl_nexthop_compat_mode); } } @@ -1041,7 +1042,7 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
- if (replace_notify) + if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..95ad71e76cc3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -710,6 +710,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_tcp_early_demux }, + { + .procname = "nexthop_compat_mode", + .data = &init_net.ipv4.sysctl_nexthop_compat_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 486c36a14f24..803212aae4ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5557,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; - if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + if (net->ipv4.sysctl_nexthop_compat_mode && + rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; -- cgit v1.2.3 From ff2269f16a1e1a7f8bbe72920d3d285ba3943572 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:21 +0200 Subject: docs: networking: convert atm.txt to ReST There isn't much to be done here. Just: - add SPDX header; - add a document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/atm.rst | 14 ++++++++++++++ Documentation/networking/atm.txt | 8 -------- Documentation/networking/index.rst | 1 + net/atm/Kconfig | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) create mode 100644 Documentation/networking/atm.rst delete mode 100644 Documentation/networking/atm.txt (limited to 'net') diff --git a/Documentation/networking/atm.rst b/Documentation/networking/atm.rst new file mode 100644 index 000000000000..c1df8c038525 --- /dev/null +++ b/Documentation/networking/atm.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +ATM +=== + +In order to use anything but the most primitive functions of ATM, +several user-mode programs are required to assist the kernel. These +programs and related material can be found via the ATM on Linux Web +page at http://linux-atm.sourceforge.net/ + +If you encounter problems with ATM, please report them on the ATM +on Linux mailing list. Subscription information, archives, etc., +can be found on http://linux-atm.sourceforge.net/ diff --git a/Documentation/networking/atm.txt b/Documentation/networking/atm.txt deleted file mode 100644 index 82921cee77fe..000000000000 --- a/Documentation/networking/atm.txt +++ /dev/null @@ -1,8 +0,0 @@ -In order to use anything but the most primitive functions of ATM, -several user-mode programs are required to assist the kernel. These -programs and related material can be found via the ATM on Linux Web -page at http://linux-atm.sourceforge.net/ - -If you encounter problems with ATM, please report them on the ATM -on Linux mailing list. Subscription information, archives, etc., -can be found on http://linux-atm.sourceforge.net/ diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3e0a4bb23ef9..841f3c3905d5 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -41,6 +41,7 @@ Contents: altera_tse arcnet-hardware arcnet + atm .. 
only:: subproject and html diff --git a/net/atm/Kconfig b/net/atm/Kconfig index 271f682e8438..e61dcc9f85b2 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -16,7 +16,7 @@ config ATM of your ATM card below. Note that you need a set of user-space programs to actually make use - of ATM. See the file for + of ATM. See the file for further details. config ATM_CLIP -- cgit v1.2.3 From 20b943f075574c233de51fa2f0124a97f0298be1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:22 +0200 Subject: docs: networking: convert ax25.txt to ReST There isn't much to be done here. Just: - add SPDX header; - add a document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/ax25.rst | 16 ++++++++++++++++ Documentation/networking/ax25.txt | 10 ---------- Documentation/networking/index.rst | 1 + net/ax25/Kconfig | 6 +++--- 4 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 Documentation/networking/ax25.rst delete mode 100644 Documentation/networking/ax25.txt (limited to 'net') diff --git a/Documentation/networking/ax25.rst b/Documentation/networking/ax25.rst new file mode 100644 index 000000000000..824afd7002db --- /dev/null +++ b/Documentation/networking/ax25.rst @@ -0,0 +1,16 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===== +AX.25 +===== + +To use the amateur radio protocols within Linux you will need to get a +suitable copy of the AX.25 Utilities. More detailed information about +AX.25, NET/ROM and ROSE, associated programs and utilities can be +found on http://www.linux-ax25.org. + +There is an active mailing list for discussing Linux amateur radio matters +called linux-hams@vger.kernel.org. To subscribe to it, send a message to +majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body +of the message, the subject field is ignored. You don't need to be +subscribed to post but of course that means you might miss an answer. diff --git a/Documentation/networking/ax25.txt b/Documentation/networking/ax25.txt deleted file mode 100644 index 8257dbf9be57..000000000000 --- a/Documentation/networking/ax25.txt +++ /dev/null @@ -1,10 +0,0 @@ -To use the amateur radio protocols within Linux you will need to get a -suitable copy of the AX.25 Utilities. More detailed information about -AX.25, NET/ROM and ROSE, associated programs and and utilities can be -found on http://www.linux-ax25.org. - -There is an active mailing list for discussing Linux amateur radio matters -called linux-hams@vger.kernel.org. To subscribe to it, send a message to -majordomo@vger.kernel.org with the words "subscribe linux-hams" in the body -of the message, the subject field is ignored. You don't need to be -subscribed to post but of course that means you might miss an answer. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 841f3c3905d5..6a5858b27cf6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -42,6 +42,7 @@ Contents: arcnet-hardware arcnet atm + ax25 .. only:: subproject and html diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 043fd5437809..97d686d115c0 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -40,7 +40,7 @@ config AX25 radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from . You might also want to - check out the file in the + check out the file in the kernel source. More information about digital amateur radio in general is on the WWW at .
@@ -88,7 +88,7 @@ config NETROM users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from . You also might want to check out the - file . More information about + file . More information about digital amateur radio in general is on the WWW at . @@ -107,7 +107,7 @@ config ROSE users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from . You also might want to check out the - file . More information about + file . More information about digital amateur radio in general is on the WWW at . -- cgit v1.2.3 From 9a69fb9c21c4bf4107becb877729544759bdd059 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:30 +0200 Subject: docs: networking: convert decnet.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark lists as such; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/networking/decnet.rst | 243 ++++++++++++++++++++++++ Documentation/networking/decnet.txt | 230 ---------------------- Documentation/networking/index.rst | 1 + MAINTAINERS | 2 +- net/decnet/Kconfig | 4 +- 6 files changed, 248 insertions(+), 234 deletions(-) create mode 100644 Documentation/networking/decnet.rst delete mode 100644 Documentation/networking/decnet.txt (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..b23ab11587a6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -831,7 +831,7 @@ decnet.addr= [HW,NET] Format: [,] - See also Documentation/networking/decnet.txt. + See also Documentation/networking/decnet.rst. default_hugepagesz= [same as hugepagesz=] The size of the default diff --git a/Documentation/networking/decnet.rst b/Documentation/networking/decnet.rst new file mode 100644 index 000000000000..b8bc11ff8370 --- /dev/null +++ b/Documentation/networking/decnet.rst @@ -0,0 +1,243 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Linux DECnet Networking Layer Information +========================================= + +1. Other documentation.... +========================== + + - Project Home Pages + - http://www.chygwyn.com/ - Kernel info + - http://linux-decnet.sourceforge.net/ - Userland tools + - http://www.sourceforge.net/projects/linux-decnet/ - Status page + +2. Configuring the kernel +========================= + +Be sure to turn on the following options: + + - CONFIG_DECNET (obviously) + - CONFIG_PROC_FS (to see what's going on) + - CONFIG_SYSCTL (for easy configuration) + +if you want to try out router support (not properly debugged yet) +you'll need the following options as well... + + - CONFIG_DECNET_ROUTER (to be able to add/delete routes) + - CONFIG_NETFILTER (will be required for the DECnet routing daemon) + +Don't turn on SIOCGIFCONF support for DECnet unless you are really sure +that you need it, in general you won't and it can cause ifconfig to +malfunction. + +Run time configuration has changed slightly from the 2.4 system. If you +want to configure an endnode, then the simplified procedure is as follows: + + - Set the MAC address on your ethernet card before starting _any_ other + network protocols. 
+ +As soon as your network card is brought into the UP state, DECnet should +start working. If you need something more complicated or are unsure how +to set the MAC address, see the next section. Also all configurations which +worked with 2.4 will work under 2.5 with no change. + +3. Command line options +======================= + +You can set a DECnet address on the kernel command line for compatibility +with the 2.4 configuration procedure, but in general it's not needed any more. +If you do set a DECnet address on the command line, it has only one purpose +which is that it is added to the addresses on the loopback device. + +With 2.4 kernels, DECnet would only recognise addresses as local if they +were added to the loopback device. In 2.5, any local interface address +can be used to loop back to the local machine. Of course this does not +prevent you adding further addresses to the loopback device if you +want to. + +N.B. Since the address list of an interface determines the addresses for +which "hello" messages are sent, if you don't set an address on the loopback +interface then you won't see any entries in /proc/net/neigh for the local +host until such time as you start a connection. This doesn't affect the +operation of the local communications in any other way though. + +The kernel command line takes options looking like the following:: + + decnet.addr=1,2 + +the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels +and early 2.3.xx kernels, you must use a comma when specifying the +DECnet address like this. For more recent 2.3.xx kernels, you may +use almost any character except space, although a `.` would be the most +obvious choice :-) + +There used to be a third number specifying the node type. This option +has gone away in favour of a per interface node type. This is now set +using /proc/sys/net/decnet/conf//forwarding. This file can be +set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. + +There are also equivalent options for modules. The node address can +also be set through the /proc/sys/net/decnet/ files, as can other system +parameters. + +Currently the only supported devices are ethernet and ip_gre. The +ethernet address of your ethernet card has to be set according to the DECnet +address of the node in order for it to be autoconfigured (and then appear in +/proc/net/decnet_dev). There is a utility available at the above +FTP sites called dn2ethaddr which can compute the correct ethernet +address to use. The address can be set by ifconfig either before or +at the time the device is brought up. If you are using RedHat you can +add the line:: + + MACADDR=AA:00:04:00:03:04 + +or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or +wherever your network card's configuration lives. Setting the MAC address +of your ethernet card to an address starting with "hi-ord" will cause a +DECnet address which matches to be added to the interface (which you can +verify with iproute2). + +The default device for routing can be set through the /proc filesystem +by setting /proc/sys/net/decnet/default_device to the +device you want DECnet to route packets out of when no specific route +is available. Usually this will be eth0, for example:: + + echo -n "eth0" >/proc/sys/net/decnet/default_device + +If you don't set the default device, then it will default to the first +ethernet card which has been autoconfigured as described above. You can +confirm that by looking in the default_device file of course.
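As a sketch of the computation dn2ethaddr performs (assuming the standard DECnet convention of a 6-bit area and 10-bit node packed into a 16-bit identifier, stored little-endian behind the AA:00:04:00 "hi-ord" prefix):

    /* Map a DECnet area.node address to its Ethernet MAC address.
     * Example: 1.2 -> id 0x0402 -> AA:00:04:00:02:04, matching the
     * MACADDR style shown above.  Input validation is omitted.
     */
    #include <stdio.h>

    static void dn_to_ethaddr(unsigned int area, unsigned int node)
    {
        unsigned int id = (area << 10) | node;  /* area 1-63, node 1-1023 */

        printf("AA:00:04:00:%02X:%02X\n", id & 0xff, (id >> 8) & 0xff);
    }

    int main(void)
    {
        dn_to_ethaddr(1, 2);    /* prints AA:00:04:00:02:04 */
        return 0;
    }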
+ +There is a list of what the other files under /proc/sys/net/decnet/ do +on the kernel patch web site (shown above). + +4. Run time kernel configuration +================================ + + +This is either done through the sysctl/proc interface (see the kernel web +pages for details on what the various options do) or through the iproute2 +package in the same way as IPv4/6 configuration is performed. + +Documentation for iproute2 is included with the package, although there is +as yet no specific section on DECnet, most of the features apply to both +IP and DECnet, albeit with DECnet addresses instead of IP addresses and +a reduced functionality. + +If you want to configure a DECnet router you'll need the iproute2 package +since it's the _only_ way to add and delete routes currently. Eventually +there will be a routing daemon to send and receive routing messages for +each interface and update the kernel routing tables accordingly. The +routing daemon will use netfilter to listen to routing packets, and +rtnetlink to update the kernel's routing tables. + +The DECnet raw socket layer has been removed since it was there purely +for use by the routing daemon which will now use netfilter (a much cleaner +and more generic solution) instead. + +5. How can I tell if it's working? +================================= + +Here is a quick guide of what to look for in order to know if your DECnet +kernel subsystem is working. + + - Is the node address set (see /proc/sys/net/decnet/node_address) + - Is the node of the correct type + (see /proc/sys/net/decnet/conf//forwarding) + - Is the Ethernet MAC address of each Ethernet card set to match + the DECnet address. If in doubt use the dn2ethaddr utility available + at the ftp archive. + - If the previous two steps are satisfied, and the Ethernet card is up, + you should find that it is listed in /proc/net/decnet_dev and also + that it appears as a directory in /proc/sys/net/decnet/conf/. The + loopback device (lo) should also appear and is required to communicate + within a node. + - If you have any DECnet routers on your network, they should appear + in /proc/net/decnet_neigh, otherwise this file will only contain the + entry for the node itself (if it doesn't, check to see if lo is up). + - If you want to send to any node which is not listed in the + /proc/net/decnet_neigh file, you'll need to set the default device + to point to an Ethernet card with connection to a router. This is + again done with the /proc/sys/net/decnet/default_device file. + - Try starting a simple server and client, like the dnping/dnmirror + over the loopback interface. With luck they should communicate. + For this step and those after, you'll need the DECnet library + which can be obtained from the above ftp sites as well as the + actual utilities themselves. + - If this seems to work, then try talking to a node on your local + network, and see if you can obtain the same results. + - At this point you are on your own... :-) + +6. How to send a bug report +=========================== + +If you've found a bug and want to report it, then there are several things +you can do to help me work out exactly what it is that is wrong. Useful +information (_most_ of which _is_ _essential_) includes: + + - What kernel version are you running ? + - What version of the patch are you running ? + - How far through the above set of tests can you get ? + - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? + - Which services are you running ? + - Which client caused the problem ?
+ - How much data was being transferred ? + - Was the network congested ? + - How can the problem be reproduced ? + - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of + tcpdump don't understand how to dump DECnet properly, so including + the hex listing of the packet contents is _essential_, usually the -x flag. + You may also need to increase the length grabbed with the -s flag. The + -e flag also provides very useful information (ethernet MAC addresses)) + +7. MAC FAQ +========== + +A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet +interact and how to get the best performance from your hardware. + +Ethernet cards are designed to normally only pass received network frames +to a host computer when they are addressed to it, or to the broadcast address. + +Linux has an interface which allows the setting of extra addresses for +an ethernet card to listen to. If the ethernet card supports it, the +filtering operation will be done in hardware, if not the extra unwanted packets +received will be discarded by the host computer. In the latter case, +significant processor time and bus bandwidth can be used up on a busy +network (see the NAPI documentation for a longer explanation of these +effects). + +DECnet makes use of this interface to allow running DECnet on an ethernet +card which has already been configured using TCP/IP (presumably using the +built in MAC address of the card, as usual) and/or to allow multiple DECnet +addresses on each physical interface. If you do this, be aware that if your +ethernet card doesn't support perfect hashing in its MAC address filter +then your computer will be doing more work than required. Some cards +will simply set themselves into promiscuous mode in order to receive +packets from the DECnet specified addresses. So if you have one of these +cards its better to set the MAC address of the card as described above +to gain the best efficiency. Better still is to use a card which supports +NAPI as well. + + +8. Mailing list +=============== + +If you are keen to get involved in development, or want to ask questions +about configuration, or even just report bugs, then there is a mailing +list that you can join, details are at: + +http://sourceforge.net/mail/?group_id=4993 + +9. Legal Info +============= + +The Linux DECnet project team have placed their code under the GPL. The +software is provided "as is" and without warranty express or implied. +DECnet is a trademark of Compaq. This software is not a product of +Compaq. We acknowledge the help of people at Compaq in providing extra +documentation above and beyond what was previously publicly available. + +Steve Whitehouse + diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt deleted file mode 100644 index d192f8b9948b..000000000000 --- a/Documentation/networking/decnet.txt +++ /dev/null @@ -1,230 +0,0 @@ - Linux DECnet Networking Layer Information - =========================================== - -1) Other documentation.... - - o Project Home Pages - http://www.chygwyn.com/ - Kernel info - http://linux-decnet.sourceforge.net/ - Userland tools - http://www.sourceforge.net/projects/linux-decnet/ - Status page - -2) Configuring the kernel - -Be sure to turn on the following options: - - CONFIG_DECNET (obviously) - CONFIG_PROC_FS (to see what's going on) - CONFIG_SYSCTL (for easy configuration) - -if you want to try out router support (not properly debugged yet) -you'll need the following options as well... 
- - CONFIG_DECNET_ROUTER (to be able to add/delete routes) - CONFIG_NETFILTER (will be required for the DECnet routing daemon) - -Don't turn on SIOCGIFCONF support for DECnet unless you are really sure -that you need it, in general you won't and it can cause ifconfig to -malfunction. - -Run time configuration has changed slightly from the 2.4 system. If you -want to configure an endnode, then the simplified procedure is as follows: - - o Set the MAC address on your ethernet card before starting _any_ other - network protocols. - -As soon as your network card is brought into the UP state, DECnet should -start working. If you need something more complicated or are unsure how -to set the MAC address, see the next section. Also all configurations which -worked with 2.4 will work under 2.5 with no change. - -3) Command line options - -You can set a DECnet address on the kernel command line for compatibility -with the 2.4 configuration procedure, but in general it's not needed any more. -If you do st a DECnet address on the command line, it has only one purpose -which is that its added to the addresses on the loopback device. - -With 2.4 kernels, DECnet would only recognise addresses as local if they -were added to the loopback device. In 2.5, any local interface address -can be used to loop back to the local machine. Of course this does not -prevent you adding further addresses to the loopback device if you -want to. - -N.B. Since the address list of an interface determines the addresses for -which "hello" messages are sent, if you don't set an address on the loopback -interface then you won't see any entries in /proc/net/neigh for the local -host until such time as you start a connection. This doesn't affect the -operation of the local communications in any other way though. - -The kernel command line takes options looking like the following: - - decnet.addr=1,2 - -the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels -and early 2.3.xx kernels, you must use a comma when specifying the -DECnet address like this. For more recent 2.3.xx kernels, you may -use almost any character except space, although a `.` would be the most -obvious choice :-) - -There used to be a third number specifying the node type. This option -has gone away in favour of a per interface node type. This is now set -using /proc/sys/net/decnet/conf//forwarding. This file can be -set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. - -There are also equivalent options for modules. The node address can -also be set through the /proc/sys/net/decnet/ files, as can other system -parameters. - -Currently the only supported devices are ethernet and ip_gre. The -ethernet address of your ethernet card has to be set according to the DECnet -address of the node in order for it to be autoconfigured (and then appear in -/proc/net/decnet_dev). There is a utility available at the above -FTP sites called dn2ethaddr which can compute the correct ethernet -address to use. The address can be set by ifconfig either before or -at the time the device is brought up. If you are using RedHat you can -add the line: - - MACADDR=AA:00:04:00:03:04 - -or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or -wherever your network card's configuration lives. Setting the MAC address -of your ethernet card to an address starting with "hi-ord" will cause a -DECnet address which matches to be added to the interface (which you can -verify with iproute2). 
- -The default device for routing can be set through the /proc filesystem -by setting /proc/sys/net/decnet/default_device to the -device you want DECnet to route packets out of when no specific route -is available. Usually this will be eth0, for example: - - echo -n "eth0" >/proc/sys/net/decnet/default_device - -If you don't set the default device, then it will default to the first -ethernet card which has been autoconfigured as described above. You can -confirm that by looking in the default_device file of course. - -There is a list of what the other files under /proc/sys/net/decnet/ do -on the kernel patch web site (shown above). - -4) Run time kernel configuration - -This is either done through the sysctl/proc interface (see the kernel web -pages for details on what the various options do) or through the iproute2 -package in the same way as IPv4/6 configuration is performed. - -Documentation for iproute2 is included with the package, although there is -as yet no specific section on DECnet, most of the features apply to both -IP and DECnet, albeit with DECnet addresses instead of IP addresses and -a reduced functionality. - -If you want to configure a DECnet router you'll need the iproute2 package -since its the _only_ way to add and delete routes currently. Eventually -there will be a routing daemon to send and receive routing messages for -each interface and update the kernel routing tables accordingly. The -routing daemon will use netfilter to listen to routing packets, and -rtnetlink to update the kernels routing tables. - -The DECnet raw socket layer has been removed since it was there purely -for use by the routing daemon which will now use netfilter (a much cleaner -and more generic solution) instead. - -5) How can I tell if its working ? - -Here is a quick guide of what to look for in order to know if your DECnet -kernel subsystem is working. - - - Is the node address set (see /proc/sys/net/decnet/node_address) - - Is the node of the correct type - (see /proc/sys/net/decnet/conf//forwarding) - - Is the Ethernet MAC address of each Ethernet card set to match - the DECnet address. If in doubt use the dn2ethaddr utility available - at the ftp archive. - - If the previous two steps are satisfied, and the Ethernet card is up, - you should find that it is listed in /proc/net/decnet_dev and also - that it appears as a directory in /proc/sys/net/decnet/conf/. The - loopback device (lo) should also appear and is required to communicate - within a node. - - If you have any DECnet routers on your network, they should appear - in /proc/net/decnet_neigh, otherwise this file will only contain the - entry for the node itself (if it doesn't check to see if lo is up). - - If you want to send to any node which is not listed in the - /proc/net/decnet_neigh file, you'll need to set the default device - to point to an Ethernet card with connection to a router. This is - again done with the /proc/sys/net/decnet/default_device file. - - Try starting a simple server and client, like the dnping/dnmirror - over the loopback interface. With luck they should communicate. - For this step and those after, you'll need the DECnet library - which can be obtained from the above ftp sites as well as the - actual utilities themselves. - - If this seems to work, then try talking to a node on your local - network, and see if you can obtain the same results. - - At this point you are on your own... 
:-) - -6) How to send a bug report - -If you've found a bug and want to report it, then there are several things -you can do to help me work out exactly what it is that is wrong. Useful -information (_most_ of which _is_ _essential_) includes: - - - What kernel version are you running ? - - What version of the patch are you running ? - - How far though the above set of tests can you get ? - - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? - - Which services are you running ? - - Which client caused the problem ? - - How much data was being transferred ? - - Was the network congested ? - - How can the problem be reproduced ? - - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of - tcpdump don't understand how to dump DECnet properly, so including - the hex listing of the packet contents is _essential_, usually the -x flag. - You may also need to increase the length grabbed with the -s flag. The - -e flag also provides very useful information (ethernet MAC addresses)) - -7) MAC FAQ - -A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet -interact and how to get the best performance from your hardware. - -Ethernet cards are designed to normally only pass received network frames -to a host computer when they are addressed to it, or to the broadcast address. - -Linux has an interface which allows the setting of extra addresses for -an ethernet card to listen to. If the ethernet card supports it, the -filtering operation will be done in hardware, if not the extra unwanted packets -received will be discarded by the host computer. In the latter case, -significant processor time and bus bandwidth can be used up on a busy -network (see the NAPI documentation for a longer explanation of these -effects). - -DECnet makes use of this interface to allow running DECnet on an ethernet -card which has already been configured using TCP/IP (presumably using the -built in MAC address of the card, as usual) and/or to allow multiple DECnet -addresses on each physical interface. If you do this, be aware that if your -ethernet card doesn't support perfect hashing in its MAC address filter -then your computer will be doing more work than required. Some cards -will simply set themselves into promiscuous mode in order to receive -packets from the DECnet specified addresses. So if you have one of these -cards its better to set the MAC address of the card as described above -to gain the best efficiency. Better still is to use a card which supports -NAPI as well. - - -8) Mailing list - -If you are keen to get involved in development, or want to ask questions -about configuration, or even just report bugs, then there is a mailing -list that you can join, details are at: - -http://sourceforge.net/mail/?group_id=4993 - -9) Legal Info - -The Linux DECnet project team have placed their code under the GPL. The -software is provided "as is" and without warranty express or implied. -DECnet is a trademark of Compaq. This software is not a product of -Compaq. We acknowledge the help of people at Compaq in providing extra -documentation above and beyond what was previously publicly available. - -Steve Whitehouse - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 9e83d3bda4e0..e17432492745 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -50,6 +50,7 @@ Contents: cxacru dccp dctcp + decnet .. 
only:: subproject and html diff --git a/MAINTAINERS b/MAINTAINERS index 453fe0713e68..7323bfc1720f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4728,7 +4728,7 @@ DECnet NETWORK LAYER L: linux-decnet-user@lists.sourceforge.net S: Orphan W: http://linux-decnet.sourceforge.net -F: Documentation/networking/decnet.txt +F: Documentation/networking/decnet.rst F: net/decnet/ DECSTATION PLATFORM SUPPORT diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 0935453ccfd5..8f98fb2f2ec9 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -15,7 +15,7 @@ config DECNET . More detailed documentation is available in - . + . Be sure to say Y to "/proc file system support" and "Sysctl support" below when using DECnet, since you will need sysctl support to aid @@ -40,4 +40,4 @@ config DECNET_ROUTER filtering" option will be required for the forthcoming routing daemon to work. - See for more information. + See for more information. -- cgit v1.2.3 From 9dfe1361261be48c92fd7cb26909cbcd5d496220 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:32 +0200 Subject: docs: networking: convert dns_resolver.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - comment out text-only TOC from html/pdf output; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/dns_resolver.rst | 155 +++++++++++++++++++++++++++++ Documentation/networking/dns_resolver.txt | 157 ------------------------------ Documentation/networking/index.rst | 1 + net/ceph/Kconfig | 2 +- net/dns_resolver/Kconfig | 2 +- net/dns_resolver/dns_key.c | 2 +- net/dns_resolver/dns_query.c | 2 +- 7 files changed, 160 insertions(+), 161 deletions(-) create mode 100644 Documentation/networking/dns_resolver.rst delete mode 100644 Documentation/networking/dns_resolver.txt (limited to 'net') diff --git a/Documentation/networking/dns_resolver.rst b/Documentation/networking/dns_resolver.rst new file mode 100644 index 000000000000..add4d59a99a5 --- /dev/null +++ b/Documentation/networking/dns_resolver.rst @@ -0,0 +1,155 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +DNS Resolver Module +=================== + +.. Contents: + + - Overview. + - Compilation. + - Setting up. + - Usage. + - Mechanism. + - Debugging. + + +Overview +======== + +The DNS resolver module provides a way for kernel services to make DNS queries +by way of requesting a key of key type dns_resolver. These queries are +upcalled to userspace through /sbin/request-key. + +These routines must be supported by userspace tools dns.upcall, cifs.upcall and +request-key. It is under development and does not yet provide the full feature +set. The features it does support include: + + (*) Implements the dns_resolver key_type to contact userspace. + +It does not yet support the following AFS features: + + (*) Dns query support for AFSDB resource record. + +This code is extracted from the CIFS filesystem. + + +Compilation +=========== + +The module should be enabled by turning on the kernel configuration options:: + + CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" + + +Setting up +========== + +To set up this facility, the /etc/request-key.conf file must be altered so that +/sbin/request-key can appropriately direct the upcalls. 
For example, to handle +basic dname to IPv4/IPv6 address resolution, the following line should be +added:: + + + #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... + #====== ============ ======= ======= ========================== + create dns_resolver * * /usr/sbin/cifs.upcall %k + +To direct a query for query type 'foo', a line of the following should be added +before the more general line given above as the first match is the one taken:: + + create dns_resolver foo:* * /usr/sbin/dns.foo %k + + +Usage +===== + +To make use of this facility, one of the following functions that are +implemented in the module can be called after doing:: + + #include + + :: + + int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry); + + This is the basic access function. It looks for a cached DNS query and if + it doesn't find it, it upcalls to userspace to make a new DNS query, which + may then be cached. The key description is constructed as a string of the + form:: + + [:] + + where optionally specifies the particular upcall program to invoke, + and thus the type of query to do, and specifies the string to be + looked up. The default query type is a straight hostname to IP address + set lookup. + + The name parameter is not required to be a NUL-terminated string, and its + length should be given by the namelen argument. + + The options parameter may be NULL or it may be a set of options + appropriate to the query type. + + The return value is a string appropriate to the query type. For instance, + for the default query type it is just a list of comma-separated IPv4 and + IPv6 addresses. The caller must free the result. + + The length of the result string is returned on success, and a negative + error code is returned otherwise. -EKEYREJECTED will be returned if the + DNS lookup failed. + + If _expiry is non-NULL, the expiry time (TTL) of the result will be + returned also. + +The kernel maintains an internal keyring in which it caches looked up keys. +This can be cleared by any process that has the CAP_SYS_ADMIN capability by +the use of KEYCTL_KEYRING_CLEAR on the keyring ID. + + +Reading DNS Keys from Userspace +=============================== + +Keys of dns_resolver type can be read from userspace using keyctl_read() or +"keyctl read/print/pipe". + + +Mechanism +========= + +The dnsresolver module registers a key type called "dns_resolver". Keys of +this type are used to transport and cache DNS lookup results from userspace. + +When dns_query() is invoked, it calls request_key() to search the local +keyrings for a cached DNS result. If that fails to find one, it upcalls to +userspace to get a new result. + +Upcalls to userspace are made through the request_key() upcall vector, and are +directed by means of configuration lines in /etc/request-key.conf that tell +/sbin/request-key what program to run to instantiate the key. + +The upcall handler program is responsible for querying the DNS, processing the +result into a form suitable for passing to the keyctl_instantiate_key() +routine. This then passes the data to dns_resolver_instantiate() which strips +off and processes any options included in the data, and then attaches the +remainder of the string to the key as its payload. + +The upcall handler program should set the expiry time on the key to that of the +lowest TTL of all the records it has extracted a result from. This means that +the key will be discarded and recreated when the data it holds has expired. 
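To make the calling convention concrete, here is a hypothetical kernel-side caller built against the dns_query() prototype quoted above (the prototype has varied across kernel versions, and resolve_host() with its logging is illustrative only):

    #include <linux/dns_resolver.h>
    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int resolve_host(const char *name)
    {
        char *result = NULL;
        time_t expiry;
        int len;

        /* A NULL type selects the default hostname -> address-list query */
        len = dns_query(NULL, name, strlen(name), NULL, &result, &expiry);
        if (len < 0)
            return len;     /* -EKEYREJECTED means the DNS lookup failed */

        pr_info("%s -> %.*s (ttl ends %ld)\n",
                name, len, result, (long)expiry);
        kfree(result);      /* the caller must free the result */
        return 0;
    }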
+ +dns_query() returns a copy of the value attached to the key, or an error if +that is indicated instead. + +See <file:Documentation/security/keys/request-key.rst> for further +information about request-key function. + + +Debugging +========= + +Debugging messages can be turned on dynamically by writing a 1 into the +following file:: + + /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.txt deleted file mode 100644 index eaa8f9a6fd5d..000000000000 --- a/Documentation/networking/dns_resolver.txt +++ /dev/null @@ -1,157 +0,0 @@ - =================== - DNS Resolver Module - =================== - -Contents: - - - Overview. - - Compilation. - - Setting up. - - Usage. - - Mechanism. - - Debugging. - - -======== -OVERVIEW -======== - -The DNS resolver module provides a way for kernel services to make DNS queries -by way of requesting a key of key type dns_resolver. These queries are -upcalled to userspace through /sbin/request-key. - -These routines must be supported by userspace tools dns.upcall, cifs.upcall and -request-key. It is under development and does not yet provide the full feature -set. The features it does support include: - - (*) Implements the dns_resolver key_type to contact userspace. - -It does not yet support the following AFS features: - - (*) Dns query support for AFSDB resource record. - -This code is extracted from the CIFS filesystem. - - -=========== -COMPILATION -=========== - -The module should be enabled by turning on the kernel configuration options: - - CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" - - -========== -SETTING UP -========== - -To set up this facility, the /etc/request-key.conf file must be altered so that -/sbin/request-key can appropriately direct the upcalls. For example, to handle -basic dname to IPv4/IPv6 address resolution, the following line should be -added: - - #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... - #====== ============ ======= ======= ========================== - create dns_resolver * * /usr/sbin/cifs.upcall %k - -To direct a query for query type 'foo', a line of the following should be added -before the more general line given above as the first match is the one taken. - - create dns_resolver foo:* * /usr/sbin/dns.foo %k - - -===== -USAGE -===== - -To make use of this facility, one of the following functions that are -implemented in the module can be called after doing: - - #include <linux/dns_resolver.h> - - (1) int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); - - This is the basic access function. It looks for a cached DNS query and if - it doesn't find it, it upcalls to userspace to make a new DNS query, which - may then be cached. The key description is constructed as a string of the - form: - - [<query_type>:]<name> - - where <query_type> optionally specifies the particular upcall program to invoke, - and thus the type of query to do, and <name> specifies the string to be - looked up. The default query type is a straight hostname to IP address - set lookup. - - The name parameter is not required to be a NUL-terminated string, and its - length should be given by the namelen argument. - - The options parameter may be NULL or it may be a set of options - appropriate to the query type. - - The return value is a string appropriate to the query type. For instance, - for the default query type it is just a list of comma-separated IPv4 and - IPv6 addresses. The caller must free the result.
- - The length of the result string is returned on success, and a negative - error code is returned otherwise. -EKEYREJECTED will be returned if the - DNS lookup failed. - - If _expiry is non-NULL, the expiry time (TTL) of the result will be - returned also. - -The kernel maintains an internal keyring in which it caches looked up keys. -This can be cleared by any process that has the CAP_SYS_ADMIN capability by -the use of KEYCTL_KEYRING_CLEAR on the keyring ID. - - -=============================== -READING DNS KEYS FROM USERSPACE -=============================== - -Keys of dns_resolver type can be read from userspace using keyctl_read() or -"keyctl read/print/pipe". - - -========= -MECHANISM -========= - -The dnsresolver module registers a key type called "dns_resolver". Keys of -this type are used to transport and cache DNS lookup results from userspace. - -When dns_query() is invoked, it calls request_key() to search the local -keyrings for a cached DNS result. If that fails to find one, it upcalls to -userspace to get a new result. - -Upcalls to userspace are made through the request_key() upcall vector, and are -directed by means of configuration lines in /etc/request-key.conf that tell -/sbin/request-key what program to run to instantiate the key. - -The upcall handler program is responsible for querying the DNS, processing the -result into a form suitable for passing to the keyctl_instantiate_key() -routine. This then passes the data to dns_resolver_instantiate() which strips -off and processes any options included in the data, and then attaches the -remainder of the string to the key as its payload. - -The upcall handler program should set the expiry time on the key to that of the -lowest TTL of all the records it has extracted a result from. This means that -the key will be discarded and recreated when the data it holds has expired. - -dns_query() returns a copy of the value attached to the key, or an error if -that is indicated instead. - -See <file:Documentation/security/keys/request-key.rst> for further -information about request-key function. - - -========= -DEBUGGING -========= - -Debugging messages can be turned on dynamically by writing a 1 into the -following file: - - /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c893127004b9..55746038a7e9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -52,6 +52,7 @@ Contents: dctcp decnet defza + dns_resolver .. only:: subproject and html diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 2e8e6f904920..d7bec7adc267 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -39,6 +39,6 @@ config CEPH_LIB_USE_DNS_RESOLVER be resolved using the CONFIG_DNS_RESOLVER facility. For information on how to use CONFIG_DNS_RESOLVER consult - Documentation/networking/dns_resolver.txt + Documentation/networking/dns_resolver.rst If unsure, say N. diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 0a1c2238b4bd..255df9b6e9e8 100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -19,7 +19,7 @@ config DNS_RESOLVER SMB2 later. DNS Resolver is supported by the userspace upcall helper "/sbin/dns.resolver" via /etc/request-key.conf. - See <file:Documentation/networking/dns_resolver.txt> for further + See <file:Documentation/networking/dns_resolver.rst> for further information.
To compile this as a module, choose M here: the module will be called diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index ad53eb31d40f..3aced951d5ab 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -1,6 +1,6 @@ /* Key type used to cache DNS lookups made by the kernel * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index cab4e0df924f..82b084cc1cc6 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -1,7 +1,7 @@ /* Upcall routine, designed to work as a key type and working through * /sbin/request-key to contact userspace when handling DNS queries. * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) -- cgit v1.2.3 From 8c498935585680284e5f3e5294d3c901b7c89d57 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:41 +0200 Subject: docs: networking: convert gen_stats.txt to ReST - add SPDX header; - mark code blocks and literals as such; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/gen_stats.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/gen_stats.txt | 119 ------------------------------ Documentation/networking/index.rst | 1 + net/core/gen_stats.c | 2 +- 4 files changed, 131 insertions(+), 120 deletions(-) create mode 100644 Documentation/networking/gen_stats.rst delete mode 100644 Documentation/networking/gen_stats.txt (limited to 'net') diff --git a/Documentation/networking/gen_stats.rst b/Documentation/networking/gen_stats.rst new file mode 100644 index 000000000000..595a83b9a61b --- /dev/null +++ b/Documentation/networking/gen_stats.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +Generic networking statistics for netlink users +=============================================== + +Statistic counters are grouped into structs: + +==================== ===================== ===================== +Struct TLV type Description +==================== ===================== ===================== +gnet_stats_basic TCA_STATS_BASIC Basic statistics +gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator +gnet_stats_queue TCA_STATS_QUEUE Queue statistics +none TCA_STATS_APP Application specific +==================== ===================== ===================== + + +Collecting: +----------- + +Declare the statistic structs you need:: + + struct mystruct { + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + ... + }; + +Update statistics, in dequeue() methods only, (while owning qdisc->running):: + + mystruct->tstats.packet++; + mystruct->qstats.backlog += skb->pkt_len; + + +Export to userspace (Dump): +--------------------------- + +:: + + my_dumping_routine(struct sk_buff *skb, ...) 
+ { + struct gnet_dump dump; + + if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + + if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || + gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || + gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&dump) < 0) + goto rtattr_failure; + ... + } + +TCA_STATS/TCA_XSTATS backward compatibility: +-------------------------------------------- + +Prior users of struct tc_stats and xstats can maintain backward +compatibility by calling the compat wrappers to keep providing the +existing TLV types:: + + my_dumping_routine(struct sk_buff *skb, ...) + { + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + ... + } + +A struct tc_stats will be filled out during gnet_stats_copy_* calls +and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app +was called. + + +Locking: +-------- + +Locks are taken before writing and released once all statistics have +been written. Locks are always released in case of an error. You +are responsible for making sure that the lock is initialized. + + +Rate Estimator: +--------------- + +0) Prepare an estimator attribute. Most likely this would be in user + space. The value of this TLV should contain a tc_estimator structure. + As usual, such a TLV needs to be 32 bit aligned and therefore the + length needs to be appropriately set, etc. The estimator interval + and ewma log need to be converted to the appropriate values. + tc_estimator.c::tc_setup_estimator() is advisable to be used as the + conversion routine. It does a few clever things. It takes a time + interval in microsecs, a time constant also in microsecs and a struct + tc_estimator to be populated. The returned tc_estimator can be + transported to the kernel. Transfer such a structure in a TLV of type + TCA_RATE to your code in the kernel. + +In the kernel when setting up: + +1) make sure you have basic stats and rate stats setup first. +2) make sure you have initialized stats lock that is used to setup such + stats. +3) Now initialize a new estimator:: + + int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, + mystats_lock, attr_with_tcestimator_struct); + + if ret == 0 + success + else + failed + +From now on, every time you dump my_rate_est_stats it will contain +up-to-date info. + +Once you are done, call gen_kill_estimator(my_basicstats, +my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats +are still valid (i.e still exist) at the time of making this call. 
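For illustration, the numbered steps above can be combined into a single sketch. It follows the simplified call forms used in this document (the real gen_new_estimator()/gen_kill_estimator() signatures vary across kernel versions), and every name below is a placeholder::

    static int mystruct_start_estimator(struct mystruct *q,
                                        spinlock_t *stats_lock,
                                        struct nlattr *tca_rate_tlv)
    {
            int ret;

            /* Steps 1-2: q->bstats, a rate_est field and stats_lock are
             * assumed to exist and to be initialized already. */

            /* Step 3: attach the estimator described by the TCA_RATE TLV */
            ret = gen_new_estimator(&q->bstats, &q->rate_est,
                                    stats_lock, tca_rate_tlv);
            if (ret < 0)
                    return ret;

            /* q->rate_est is now refreshed automatically; tear it down with
             * gen_kill_estimator(&q->bstats, &q->rate_est) while both are
             * still valid. */
            return 0;
    }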
+ + +Authors: +-------- +- Thomas Graf +- Jamal Hadi Salim diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt deleted file mode 100644 index 179b18ce45ff..000000000000 --- a/Documentation/networking/gen_stats.txt +++ /dev/null @@ -1,119 +0,0 @@ -Generic networking statistics for netlink users -====================================================================== - -Statistic counters are grouped into structs: - -Struct TLV type Description ----------------------------------------------------------------------- -gnet_stats_basic TCA_STATS_BASIC Basic statistics -gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator -gnet_stats_queue TCA_STATS_QUEUE Queue statistics -none TCA_STATS_APP Application specific - - -Collecting: ------------ - -Declare the statistic structs you need: -struct mystruct { - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - ... -}; - -Update statistics, in dequeue() methods only, (while owning qdisc->running) -mystruct->tstats.packet++; -mystruct->qstats.backlog += skb->pkt_len; - - -Export to userspace (Dump): ---------------------------- - -my_dumping_routine(struct sk_buff *skb, ...) -{ - struct gnet_dump dump; - - if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - - if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || - gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || - gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) - goto rtattr_failure; - - if (gnet_stats_finish_copy(&dump) < 0) - goto rtattr_failure; - ... -} - -TCA_STATS/TCA_XSTATS backward compatibility: --------------------------------------------- - -Prior users of struct tc_stats and xstats can maintain backward -compatibility by calling the compat wrappers to keep providing the -existing TLV types. - -my_dumping_routine(struct sk_buff *skb, ...) -{ - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - ... -} - -A struct tc_stats will be filled out during gnet_stats_copy_* calls -and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app -was called. - - -Locking: --------- - -Locks are taken before writing and released once all statistics have -been written. Locks are always released in case of an error. You -are responsible for making sure that the lock is initialized. - - -Rate Estimator: --------------- - -0) Prepare an estimator attribute. Most likely this would be in user - space. The value of this TLV should contain a tc_estimator structure. - As usual, such a TLV needs to be 32 bit aligned and therefore the - length needs to be appropriately set, etc. The estimator interval - and ewma log need to be converted to the appropriate values. - tc_estimator.c::tc_setup_estimator() is advisable to be used as the - conversion routine. It does a few clever things. It takes a time - interval in microsecs, a time constant also in microsecs and a struct - tc_estimator to be populated. The returned tc_estimator can be - transported to the kernel. Transfer such a structure in a TLV of type - TCA_RATE to your code in the kernel. - -In the kernel when setting up: -1) make sure you have basic stats and rate stats setup first. -2) make sure you have initialized stats lock that is used to setup such - stats. 
-3) Now initialize a new estimator: - - int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, - mystats_lock, attr_with_tcestimator_struct); - - if ret == 0 - success - else - failed - -From now on, every time you dump my_rate_est_stats it will contain -up-to-date info. - -Once you are done, call gen_kill_estimator(my_basicstats, -my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats -are still valid (i.e still exist) at the time of making this call. - - -Authors: --------- -Thomas Graf -Jamal Hadi Salim diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 42e556509e22..33afbb67f3fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -61,6 +61,7 @@ Contents: framerelay generic-hdlc generic_netlink + gen_stats .. only:: subproject and html diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include -- cgit v1.2.3 From 1cec2cacaaec5d53adc04dd3ecfdb687b26c0e89 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:49 +0200 Subject: docs: networking: convert ip-sysctl.txt to ReST - add SPDX header; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - mark lists as such; - mark tables as such; - use footnote markup; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/sysctl/net.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/ip-sysctl.rst | 2649 +++++++++++++++++++++++ Documentation/networking/ip-sysctl.txt | 2374 -------------------- Documentation/networking/snmp_counter.rst | 2 +- net/Kconfig | 2 +- net/ipv4/Kconfig | 2 +- net/ipv4/icmp.c | 2 +- 9 files changed, 2656 insertions(+), 2380 deletions(-) create mode 100644 Documentation/networking/ip-sysctl.rst delete mode 100644 Documentation/networking/ip-sysctl.txt (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b23ab11587a6..e37db6f1be64 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4910,7 +4910,7 @@ Set the number of tcp_metrics_hash slots. Default value is 8192 or 16384 depending on total ram pages. This is used to specify the TCP metrics - cache size. See Documentation/networking/ip-sysctl.txt + cache size. See Documentation/networking/ip-sysctl.rst "tcp_no_metrics_save" section for more details. tdfx= [HW,DRM] diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index e043c9213388..84e3348a9543 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -353,7 +353,7 @@ socket's buffer. It will not take effect unless PF_UNIX flag is specified. 3. /proc/sys/net/ipv4 - IPV4 settings ------------------------------------- -Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for +Please see: Documentation/networking/ip-sysctl.rst and ipvs-sysctl.txt for descriptions of these entries. 
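One practical note before the converted document itself: every variable described in ip-sysctl.rst is exposed as a file under /proc/sys/net/ipv4, so it can be inspected (or, with sufficient privilege, changed) with ordinary file I/O. A minimal user-space sketch; the entry chosen, ip_forward, is simply the first one documented below::

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            FILE *f = fopen("/proc/sys/net/ipv4/ip_forward", "r");

            if (!f)
                    return 1;
            if (fgets(buf, sizeof(buf), f))
                    printf("ip_forward = %s", buf); /* value includes '\n' */
            fclose(f);
            return 0;
    }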
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 3efb4608649a..7d133d8dbe2a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -69,6 +69,7 @@ Contents: ip_dynaddr iphase ipsec + ip-sysctl .. only:: subproject and html diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst new file mode 100644 index 000000000000..38f811d4b2f0 --- /dev/null +++ b/Documentation/networking/ip-sysctl.rst @@ -0,0 +1,2649 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +IP Sysctl +========= + +/proc/sys/net/ipv4/* Variables +============================== + +ip_forward - BOOLEAN + - 0 - disabled (default) + - not 0 - enabled + + Forward Packets between interfaces. + + This variable is special, its change resets all configuration + parameters to their default state (RFC1122 for hosts, RFC1812 + for routers) + +ip_default_ttl - INTEGER + Default value of TTL field (Time To Live) for outgoing (but not + forwarded) IP packets. Should be between 1 and 255 inclusive. + Default: 64 (as recommended by RFC1700) + +ip_no_pmtu_disc - INTEGER + Disable Path MTU Discovery. If enabled in mode 1 and a + fragmentation-required ICMP is received, the PMTU to this + destination will be set to min_pmtu (see below). You will need + to raise min_pmtu to the smallest interface MTU on your system + manually if you want to avoid locally generated fragments. + + In mode 2 incoming Path MTU Discovery messages will be + discarded. Outgoing frames are handled the same as in mode 1, + implicitly setting IP_PMTUDISC_DONT on every created socket. + + Mode 3 is a hardened pmtu discover mode. The kernel will only + accept fragmentation-needed errors if the underlying protocol + can verify them besides a plain socket lookup. Current + protocols for which pmtu events will be honored are TCP, SCTP + and DCCP as they verify e.g. the sequence number or the + association. This mode should not be enabled globally but is + only intended to secure e.g. name servers in namespaces where + TCP path mtu must still work but path MTU information of other + protocols should be discarded. If enabled globally this mode + could break other protocols. + + Possible values: 0-3 + + Default: FALSE + +min_pmtu - INTEGER + default 552 - minimum discovered Path MTU + +ip_forward_use_pmtu - BOOLEAN + By default we don't trust protocol path MTUs while forwarding + because they could be easily forged and can lead to unwanted + fragmentation by the router. + You only need to enable this if you have user-space software + which tries to discover path mtus by itself and depends on the + kernel honoring this information. This is normally not the + case. + + Default: 0 (disabled) + + Possible values: + + - 0 - disabled + - 1 - enabled + +fwmark_reflect - BOOLEAN + Controls the fwmark of kernel-generated IPv4 reply packets that are not + associated with a socket for example, TCP RSTs or ICMP echo replies). + If unset, these packets have a fwmark of zero. If set, they have the + fwmark of the packet they are replying to. + + Default: 0 + +fib_multipath_use_neigh - BOOLEAN + Use status of existing neighbor entry when determining nexthop for + multipath routes. If disabled, neighbor information is not used and + packets could be directed to a failed nexthop. Only valid for kernels + built with CONFIG_IP_ROUTE_MULTIPATH enabled. 
+ + Default: 0 (disabled) + + Possible values: + + - 0 - disabled + - 1 - enabled + +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. Only valid + for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled. + + Default: 0 (Layer 3) + + Possible values: + + - 0 - Layer 3 + - 1 - Layer 4 + - 2 - Layer 3 or inner Layer 3 if present + +fib_sync_mem - UNSIGNED INTEGER + Amount of dirty memory from fib entries that can be backlogged before + synchronize_rcu is forced. + + Default: 512kB Minimum: 64kB Maximum: 64MB + +ip_forward_update_priority - INTEGER + Whether to update SKB priority from "TOS" field in IPv4 header after it + is forwarded. The new SKB priority is mapped from TOS field value + according to an rt_tos2priority table (see e.g. man tc-prio). + + Default: 1 (Update priority.) + + Possible values: + + - 0 - Do not update priority. + - 1 - Update priority. + +route/max_size - INTEGER + Maximum number of routes allowed in the kernel. Increase + this when using large numbers of interfaces and/or routes. + + From linux kernel 3.6 onwards, this is deprecated for ipv4 + as route cache is no longer used. + +neigh/default/gc_thresh1 - INTEGER + Minimum number of entries to keep. Garbage collector will not + purge entries if there are fewer than this number. + + Default: 128 + +neigh/default/gc_thresh2 - INTEGER + Threshold when garbage collector becomes more aggressive about + purging entries. Entries older than 5 seconds will be cleared + when over this number. + + Default: 512 + +neigh/default/gc_thresh3 - INTEGER + Maximum number of non-PERMANENT neighbor entries allowed. Increase + this when using large numbers of interfaces and when communicating + with large numbers of directly-connected peers. + + Default: 1024 + +neigh/default/unres_qlen_bytes - INTEGER + The maximum number of bytes which may be used by packets + queued for each unresolved address by other network layers. + (added in linux 3.3) + + Setting negative value is meaningless and will return error. + + Default: SK_WMEM_MAX, (same as net.core.wmem_default). + + Exact value depends on architecture and kernel options, + but should be enough to allow queuing 256 packets + of medium size. + +neigh/default/unres_qlen - INTEGER + The maximum number of packets which may be queued for each + unresolved address by other network layers. + + (deprecated in linux 3.3) : use unres_qlen_bytes instead. + + Prior to linux 3.3, the default value is 3 which may cause + unexpected packet loss. The current default value is calculated + according to default value of unres_qlen_bytes and true size of + packet. + + Default: 101 + +mtu_expires - INTEGER + Time, in seconds, that cached PMTU information is kept. + +min_adv_mss - INTEGER + The advertised MSS depends on the first hop route MTU, but will + never be lower than this setting. + +IP Fragmentation: + +ipfrag_high_thresh - LONG INTEGER + Maximum memory used to reassemble IP fragments. + +ipfrag_low_thresh - LONG INTEGER + (Obsolete since linux-4.17) + Maximum memory used to reassemble IP fragments before the kernel + begins to remove incomplete fragment queues to free up resources. + The kernel still accepts new fragments for defragmentation. + +ipfrag_time - INTEGER + Time in seconds to keep an IP fragment in memory. + +ipfrag_max_dist - INTEGER + ipfrag_max_dist is a non-negative integer value which defines the + maximum "disorder" which is allowed among fragments which share a + common IP source address. 
Note that reordering of packets is + not unusual, but if a large number of fragments arrive from a source + IP address while a particular fragment queue remains incomplete, it + probably indicates that one or more fragments belonging to that queue + have been lost. When ipfrag_max_dist is positive, an additional check + is done on fragments before they are added to a reassembly queue - if + ipfrag_max_dist (or more) fragments have arrived from a particular IP + address between additions to any IP fragment queue using that source + address, it's presumed that one or more fragments in the queue are + lost. The existing fragment queue will be dropped, and a new one + started. An ipfrag_max_dist value of zero disables this check. + + Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can + result in unnecessarily dropping fragment queues when normal + reordering of packets occurs, which could lead to poor application + performance. Using a very large value, e.g. 50000, increases the + likelihood of incorrectly reassembling IP fragments that originate + from different IP datagrams, which could result in data corruption. + Default: 64 + +INET peer storage +================= + +inet_peer_threshold - INTEGER + The approximate size of the storage. Starting from this threshold + entries will be thrown aggressively. This threshold also determines + entries' time-to-live and time intervals between garbage collection + passes. More entries, less time-to-live, less GC interval. + +inet_peer_minttl - INTEGER + Minimum time-to-live of entries. Should be enough to cover fragment + time-to-live on the reassembling side. This minimum time-to-live is + guaranteed if the pool size is less than inet_peer_threshold. + Measured in seconds. + +inet_peer_maxttl - INTEGER + Maximum time-to-live of entries. Unused entries will expire after + this period of time if there is no memory pressure on the pool (i.e. + when the number of entries in the pool is very small). + Measured in seconds. + +TCP variables +============= + +somaxconn - INTEGER + Limit of socket listen() backlog, known in userspace as SOMAXCONN. + Defaults to 4096. (Was 128 before linux-5.4) + See also tcp_max_syn_backlog for additional tuning for TCP sockets. + +tcp_abort_on_overflow - BOOLEAN + If listening service is too slow to accept new connections, + reset them. Default state is FALSE. It means that if overflow + occurred due to a burst, connection will recover. Enable this + option _only_ if you are really sure that listening daemon + cannot be tuned to accept connections faster. Enabling this + option can harm clients of your server. + +tcp_adv_win_scale - INTEGER + Count buffering overhead as bytes/2^tcp_adv_win_scale + (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), + if it is <= 0. + + Possible values are [-31, 31], inclusive. + + Default: 1 + +tcp_allowed_congestion_control - STRING + Show/set the congestion control choices available to non-privileged + processes. The list is a subset of those listed in + tcp_available_congestion_control. + + Default is "reno" and the default setting (tcp_congestion_control). + +tcp_app_win - INTEGER + Reserve max(window/2^tcp_app_win, mss) of window for application + buffer. Value 0 is special, it means that nothing is reserved. + + Default: 31 + +tcp_autocorking - BOOLEAN + Enable TCP auto corking : + When applications do consecutive small write()/sendmsg() system calls, + we try to coalesce these small writes as much as possible, to lower + total amount of sent packets. 
This is done if at least one prior + packet for the flow is waiting in Qdisc queues or device transmit + queue. Applications can still use TCP_CORK for optimal behavior + when they know how/when to uncork their sockets. + + Default : 1 + +tcp_available_congestion_control - STRING + Shows the available congestion control choices that are registered. + More congestion control algorithms may be available as modules, + but not loaded. + +tcp_base_mss - INTEGER + The initial value of search_low to be used by the packetization layer + Path MTU discovery (MTU probing). If MTU probing is enabled, + this is the initial MSS used by the connection. + +tcp_mtu_probe_floor - INTEGER + If MTU probing is enabled this caps the minimum MSS used for search_low + for the connection. + + Default : 48 + +tcp_min_snd_mss - INTEGER + TCP SYN and SYNACK messages usually advertise an ADVMSS option, + as described in RFC 1122 and RFC 6691. + + If this ADVMSS option is smaller than tcp_min_snd_mss, + it is silently capped to tcp_min_snd_mss. + + Default : 48 (at least 8 bytes of payload per segment) + +tcp_congestion_control - STRING + Set the congestion control algorithm to be used for new + connections. The algorithm "reno" is always available, but + additional choices may be available based on kernel configuration. + Default is set as part of kernel configuration. + For passive connections, the listener congestion control choice + is inherited. + + [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] + +tcp_dsack - BOOLEAN + Allows TCP to send "duplicate" SACKs. + +tcp_early_retrans - INTEGER + Tail loss probe (TLP) converts RTOs occurring due to tail + losses into fast recovery (draft-ietf-tcpm-rack). Note that + TLP requires RACK to function properly (see tcp_recovery below) + + Possible values: + + - 0 disables TLP + - 3 or 4 enables TLP + + Default: 3 + +tcp_ecn - INTEGER + Control use of Explicit Congestion Notification (ECN) by TCP. + ECN is used only when both ends of the TCP connection indicate + support for it. This feature is useful in avoiding losses due + to congestion by allowing supporting routers to signal + congestion before having to drop packets. + + Possible values are: + + = ===================================================== + 0 Disable ECN. Neither initiate nor accept ECN. + 1 Enable ECN when requested by incoming connections and + also request ECN on outgoing connection attempts. + 2 Enable ECN when requested by incoming connections + but do not request ECN on outgoing connections. + = ===================================================== + + Default: 2 + +tcp_ecn_fallback - BOOLEAN + If the kernel detects that ECN connection misbehaves, enable fall + back to non-ECN. Currently, this knob implements the fallback + from RFC3168, section 6.1.1.1., but we reserve that in future, + additional detection mechanisms could be implemented under this + knob. The value is not used, if tcp_ecn or per route (or congestion + control) ECN settings are disabled. + + Default: 1 (fallback enabled) + +tcp_fack - BOOLEAN + This is a legacy option, it has no effect anymore. + +tcp_fin_timeout - INTEGER + The length of time an orphaned (no longer referenced by any + application) connection will remain in the FIN_WAIT_2 state + before it is aborted at the local end. While a perfectly + valid "receive only" state for an un-orphaned connection, an + orphaned connection in FIN_WAIT_2 state could otherwise wait + forever for the remote to close its end of the connection. + + Cf. 
tcp_max_orphans + + Default: 60 seconds + +tcp_frto - INTEGER + Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. + F-RTO is an enhanced recovery algorithm for TCP retransmission + timeouts. It is particularly beneficial in networks where the + RTT fluctuates (e.g., wireless). F-RTO is sender-side only + modification. It does not require any support from the peer. + + By default it's enabled with a non-zero value. 0 disables F-RTO. + +tcp_fwmark_accept - BOOLEAN + If set, incoming connections to listening sockets that do not have a + socket mark will set the mark of the accepting socket to the fwmark of + the incoming SYN packet. This will cause all packets on that connection + (starting from the first SYNACK) to be sent with that fwmark. The + listening socket's mark is unchanged. Listening sockets that already + have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are + unaffected. + + Default: 0 + +tcp_invalid_ratelimit - INTEGER + Limit the maximal rate for sending duplicate acknowledgments + in response to incoming TCP packets that are for an existing + connection but that are invalid due to any of these reasons: + + (a) out-of-window sequence number, + (b) out-of-window acknowledgment number, or + (c) PAWS (Protection Against Wrapped Sequence numbers) check failure + + This can help mitigate simple "ack loop" DoS attacks, wherein + a buggy or malicious middlebox or man-in-the-middle can + rewrite TCP header fields in manner that causes each endpoint + to think that the other is sending invalid TCP segments, thus + causing each side to send an unterminating stream of duplicate + acknowledgments for invalid segments. + + Using 0 disables rate-limiting of dupacks in response to + invalid segments; otherwise this value specifies the minimal + space between sending such dupacks, in milliseconds. + + Default: 500 (milliseconds). + +tcp_keepalive_time - INTEGER + How often TCP sends out keepalive messages when keepalive is enabled. + Default: 2hours. + +tcp_keepalive_probes - INTEGER + How many keepalive probes TCP sends out, until it decides that the + connection is broken. Default value: 9. + +tcp_keepalive_intvl - INTEGER + How frequently the probes are send out. Multiplied by + tcp_keepalive_probes it is time to kill not responding connection, + after probes started. Default value: 75sec i.e. connection + will be aborted after ~11 minutes of retries. + +tcp_l3mdev_accept - BOOLEAN + Enables child sockets to inherit the L3 master device index. + Enabling this option allows a "global" listen socket to work + across L3 master domains (e.g., VRFs) with connected sockets + derived from the listen socket to be bound to the L3 domain in + which the packets originated. Only valid when the kernel was + compiled with CONFIG_NET_L3_MASTER_DEV. + + Default: 0 (disabled) + +tcp_low_latency - BOOLEAN + This is a legacy option, it has no effect anymore. + +tcp_max_orphans - INTEGER + Maximal number of TCP sockets not attached to any user file handle, + held by system. If this number is exceeded orphaned connections are + reset immediately and warning is printed. This limit exists + only to prevent simple DoS attacks, you _must_ not rely on this + or lower the limit artificially, but rather increase it + (probably, after increasing installed memory), + if network conditions require more than default value, + and tune network services to linger and kill such states + more aggressively. Let me to remind again: each orphan eats + up to ~64K of unswappable memory. 
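To make the tcp_keepalive_* entries above concrete: a process can enable keepalive per socket and override the three global defaults through socket options. A hedged user-space sketch; the numeric values are illustrative and error handling for the overrides is elided::

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    static int enable_keepalive(int fd)
    {
            int on = 1, idle = 600, intvl = 60, cnt = 5;

            if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
                    return -1;
            /* Per-socket overrides of tcp_keepalive_time,
             * tcp_keepalive_intvl and tcp_keepalive_probes: */
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
            setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
            return 0;
    }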
+ +tcp_max_syn_backlog - INTEGER + Maximal number of remembered connection requests (SYN_RECV), + which have not received an acknowledgment from connecting client. + + This is a per-listener limit. + + The minimal value is 128 for low memory machines, and it will + increase in proportion to the memory of machine. + + If server suffers from overload, try increasing this number. + + Remember to also check /proc/sys/net/core/somaxconn + A SYN_RECV request socket consumes about 304 bytes of memory. + +tcp_max_tw_buckets - INTEGER + Maximal number of timewait sockets held by system simultaneously. + If this number is exceeded time-wait socket is immediately destroyed + and warning is printed. This limit exists only to prevent + simple DoS attacks, you _must_ not lower the limit artificially, + but rather increase it (probably, after increasing installed memory), + if network conditions require more than default value. + +tcp_mem - vector of 3 INTEGERs: min, pressure, max + min: below this number of pages TCP is not bothered about its + memory appetite. + + pressure: when amount of memory allocated by TCP exceeds this number + of pages, TCP moderates its memory consumption and enters memory + pressure mode, which is exited when memory consumption falls + under "min". + + max: number of pages allowed for queueing by all TCP sockets. + + Defaults are calculated at boot time from amount of available + memory. + +tcp_min_rtt_wlen - INTEGER + The window length of the windowed min filter to track the minimum RTT. + A shorter window lets a flow more quickly pick up new (higher) + minimum RTT when it is moved to a longer path (e.g., due to traffic + engineering). A longer window makes the filter more resistant to RTT + inflations such as transient congestion. The unit is seconds. + + Possible values: 0 - 86400 (1 day) + + Default: 300 + +tcp_moderate_rcvbuf - BOOLEAN + If set, TCP performs receive buffer auto-tuning, attempting to + automatically size the buffer (no greater than tcp_rmem[2]) to + match the size required by the path for full throughput. Enabled by + default. + +tcp_mtu_probing - INTEGER + Controls TCP Packetization-Layer Path MTU Discovery. Takes three + values: + + - 0 - Disabled + - 1 - Disabled by default, enabled when an ICMP black hole detected + - 2 - Always enabled, use initial MSS of tcp_base_mss. + +tcp_probe_interval - UNSIGNED INTEGER + Controls how often to start TCP Packetization-Layer Path MTU + Discovery reprobe. The default is reprobing every 10 minutes as + per RFC4821. + +tcp_probe_threshold - INTEGER + Controls when TCP Packetization-Layer Path MTU Discovery probing + will stop in respect to the width of search range in bytes. Default + is 8 bytes. + +tcp_no_metrics_save - BOOLEAN + By default, TCP saves various connection metrics in the route cache + when the connection closes, so that connections established in the + near future can use these to set initial conditions. Usually, this + increases overall performance, but may sometimes cause performance + degradation. If set, TCP will not cache metrics on closing + connections. + +tcp_no_ssthresh_metrics_save - BOOLEAN + Controls whether TCP saves ssthresh metrics in the route cache. + + Default is 1, which disables ssthresh metrics. + +tcp_orphan_retries - INTEGER + This value influences the timeout of a locally closed TCP connection, + when RTO retransmissions remain unacknowledged. + See tcp_retries2 for more details. + + The default value is 8. 
+ + If your machine is a loaded WEB server, + you should think about lowering this value, such sockets + may consume significant resources. Cf. tcp_max_orphans. + +tcp_recovery - INTEGER + This value is a bitmap to enable various experimental loss recovery + features. + + ========= ============================================================= + RACK: 0x1 enables the RACK loss detection for fast detection of lost + retransmissions and tail drops. It also subsumes and disables + RFC6675 recovery for SACK connections. + + RACK: 0x2 makes RACK's reordering window static (min_rtt/4). + + RACK: 0x4 disables RACK's DUPACK threshold heuristic + ========= ============================================================= + + Default: 0x1 + +tcp_reordering - INTEGER + Initial reordering level of packets in a TCP stream. + TCP stack can then dynamically adjust flow reordering level + between this initial value and tcp_max_reordering + + Default: 3 + +tcp_max_reordering - INTEGER + Maximal reordering level of packets in a TCP stream. + 300 is a fairly conservative value, but you might increase it + if paths are using per packet load balancing (like bonding rr mode) + + Default: 300 + +tcp_retrans_collapse - BOOLEAN + Bug-to-bug compatibility with some broken printers. + On retransmit try to send bigger packets to work around bugs in + certain TCP stacks. + +tcp_retries1 - INTEGER + This value influences the time, after which TCP decides, that + something is wrong due to unacknowledged RTO retransmissions, + and reports this suspicion to the network layer. + See tcp_retries2 for more details. + + RFC 1122 recommends at least 3 retransmissions, which is the + default. + +tcp_retries2 - INTEGER + This value influences the timeout of an alive TCP connection, + when RTO retransmissions remain unacknowledged. + Given a value of N, a hypothetical TCP connection following + exponential backoff with an initial RTO of TCP_RTO_MIN would + retransmit N times before killing the connection at the (N+1)th RTO. + + The default value of 15 yields a hypothetical timeout of 924.6 + seconds and is a lower bound for the effective timeout. + TCP will effectively time out at the first RTO which exceeds the + hypothetical timeout. + + RFC 1122 recommends at least 100 seconds for the timeout, + which corresponds to a value of at least 8. + +tcp_rfc1337 - BOOLEAN + If set, the TCP stack behaves conforming to RFC1337. If unset, + we are not conforming to RFC, but prevent TCP TIME_WAIT + assassination. + + Default: 0 + +tcp_rmem - vector of 3 INTEGERs: min, default, max + min: Minimal size of receive buffer used by TCP sockets. + It is guaranteed to each TCP socket, even under moderate memory + pressure. + + Default: 4K + + default: initial size of receive buffer used by TCP sockets. + This value overrides net.core.rmem_default used by other protocols. + Default: 87380 bytes. This value results in window of 65535 with + default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit + less for default tcp_app_win. See below about these variables. + + max: maximal size of receive buffer allowed for automatically + selected receiver buffers for TCP socket. This value does not override + net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables + automatic tuning of that socket's receive buffer size, in which + case this value is ignored. + Default: between 87380B and 6MB, depending on RAM size. + +tcp_sack - BOOLEAN + Enable select acknowledgments (SACKS). 
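A minimal sketch illustrating the tcp_rmem note above: setting SO_RCVBUF explicitly pins the buffer and disables receive-buffer autotuning for that socket. The helper name is hypothetical and the size is an arbitrary example::

    #include <sys/socket.h>

    /* After this call the kernel stops autotuning fd's receive buffer;
     * it also doubles the requested value internally to leave room for
     * bookkeeping overhead (see socket(7)). */
    static int pin_rcvbuf(int fd, int bytes)
    {
            return setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
                              &bytes, sizeof(bytes));
    }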
+ +tcp_comp_sack_delay_ns - LONG INTEGER + TCP tries to reduce number of SACK sent, using a timer + based on 5% of SRTT, capped by this sysctl, in nano seconds. + The default is 1ms, based on TSO autosizing period. + + Default : 1,000,000 ns (1 ms) + +tcp_comp_sack_nr - INTEGER + Max number of SACK that can be compressed. + Using 0 disables SACK compression. + + Default : 44 + +tcp_slow_start_after_idle - BOOLEAN + If set, provide RFC2861 behavior and time out the congestion + window after an idle period. An idle period is defined at + the current RTO. If unset, the congestion window will not + be timed out after an idle period. + + Default: 1 + +tcp_stdurg - BOOLEAN + Use the Host requirements interpretation of the TCP urgent pointer field. + Most hosts use the older BSD interpretation, so if you turn this on + Linux might not communicate correctly with them. + + Default: FALSE + +tcp_synack_retries - INTEGER + Number of times SYNACKs for a passive TCP connection attempt will + be retransmitted. Should not be higher than 255. Default value + is 5, which corresponds to 31seconds till the last retransmission + with the current initial RTO of 1second. With this the final timeout + for a passive TCP connection will happen after 63seconds. + +tcp_syncookies - INTEGER + Only valid when the kernel was compiled with CONFIG_SYN_COOKIES + Send out syncookies when the syn backlog queue of a socket + overflows. This is to prevent against the common 'SYN flood attack' + Default: 1 + + Note, that syncookies is fallback facility. + It MUST NOT be used to help highly loaded servers to stand + against legal connection rate. If you see SYN flood warnings + in your logs, but investigation shows that they occur + because of overload with legal connections, you should tune + another parameters until this warning disappear. + See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. + + syncookies seriously violate TCP protocol, do not allow + to use TCP extensions, can result in serious degradation + of some services (f.e. SMTP relaying), visible not by you, + but your clients and relays, contacting you. While you see + SYN flood warnings in logs not being really flooded, your server + is seriously misconfigured. + + If you want to test which effects syncookies have to your + network connections you can set this knob to 2 to enable + unconditionally generation of syncookies. + +tcp_fastopen - INTEGER + Enable TCP Fast Open (RFC7413) to send and accept data in the opening + SYN packet. + + The client support is enabled by flag 0x1 (on by default). The client + then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, + rather than connect() to send data in SYN. + + The server support is enabled by flag 0x2 (off by default). Then + either enable for all listeners with another flag (0x400) or + enable individual listeners via TCP_FASTOPEN socket option with + the option value being the length of the syn-data backlog. + + The values (bitmap) are + + ===== ======== ====================================================== + 0x1 (client) enables sending data in the opening SYN on the client. + 0x2 (server) enables the server support, i.e., allowing data in + a SYN packet to be accepted and passed to the + application before 3-way handshake finishes. + 0x4 (client) send data in the opening SYN regardless of cookie + availability and without a cookie option. + 0x200 (server) accept data-in-SYN w/o any cookie option present. 
+ 0x400 (server) enable all listeners to support Fast Open by + default without explicit TCP_FASTOPEN socket option. + ===== ======== ====================================================== + + Default: 0x1 + + Note that that additional client or server features are only + effective if the basic support (0x1 and 0x2) are enabled respectively. + +tcp_fastopen_blackhole_timeout_sec - INTEGER + Initial time period in second to disable Fastopen on active TCP sockets + when a TFO firewall blackhole issue happens. + This time period will grow exponentially when more blackhole issues + get detected right after Fastopen is re-enabled and will reset to + initial value when the blackhole issue goes away. + 0 to disable the blackhole detection. + + By default, it is set to 1hr. + +tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs + The list consists of a primary key and an optional backup key. The + primary key is used for both creating and validating cookies, while the + optional backup key is only used for validating cookies. The purpose of + the backup key is to maximize TFO validation when keys are rotated. + + A randomly chosen primary key may be configured by the kernel if + the tcp_fastopen sysctl is set to 0x400 (see above), or if the + TCP_FASTOPEN setsockopt() optname is set and a key has not been + previously configured via sysctl. If keys are configured via + setsockopt() by using the TCP_FASTOPEN_KEY optname, then those + per-socket keys will be used instead of any keys that are specified via + sysctl. + + A key is specified as 4 8-digit hexadecimal integers which are separated + by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be + omitted. A primary and a backup key may be specified by separating them + by a comma. If only one key is specified, it becomes the primary key and + any previously configured backup keys are removed. + +tcp_syn_retries - INTEGER + Number of times initial SYNs for an active TCP connection attempt + will be retransmitted. Should not be higher than 127. Default value + is 6, which corresponds to 63seconds till the last retransmission + with the current initial RTO of 1second. With this the final timeout + for an active TCP connection attempt will happen after 127seconds. + +tcp_timestamps - INTEGER + Enable timestamps as defined in RFC1323. + + - 0: Disabled. + - 1: Enable timestamps as defined in RFC1323 and use random offset for + each connection rather than only using the current time. + - 2: Like 1, but without random offsets. + + Default: 1 + +tcp_min_tso_segs - INTEGER + Minimal number of segments per TSO frame. + + Since linux-3.12, TCP does an automatic sizing of TSO frames, + depending on flow rate, instead of filling 64Kbytes packets. + For specific usages, it's possible to force TCP to build big + TSO frames. Note that TCP stack might split too big TSO packets + if available window is too small. + + Default: 2 + +tcp_pacing_ss_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in slow start, tcp_pacing_ss_ratio is applied + to let TCP probe for bigger speeds, assuming cwnd can be + doubled every other RTT. + + Default: 200 + +tcp_pacing_ca_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio + is applied to conservatively probe for bigger throughput. 
+ + Default: 120 + +tcp_tso_win_divisor - INTEGER + This allows control over what percentage of the congestion window + can be consumed by a single TSO frame. + The setting of this parameter is a choice between burstiness and + building larger TSO frames. + + Default: 3 + +tcp_tw_reuse - INTEGER + Enable reuse of TIME-WAIT sockets for new connections when it is + safe from protocol viewpoint. + + - 0 - disable + - 1 - global enable + - 2 - enable for loopback traffic only + + It should not be changed without advice/request of technical + experts. + + Default: 2 + +tcp_window_scaling - BOOLEAN + Enable window scaling as defined in RFC1323. + +tcp_wmem - vector of 3 INTEGERs: min, default, max + min: Amount of memory reserved for send buffers for TCP sockets. + Each TCP socket has rights to use it due to fact of its birth. + + Default: 4K + + default: initial size of send buffer used by TCP sockets. This + value overrides net.core.wmem_default used by other protocols. + + It is usually lower than net.core.wmem_default. + + Default: 16K + + max: Maximal amount of memory allowed for automatically tuned + send buffers for TCP sockets. This value does not override + net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables + automatic tuning of that socket's send buffer size, in which case + this value is ignored. + + Default: between 64K and 4MB, depending on RAM size. + +tcp_notsent_lowat - UNSIGNED INTEGER + A TCP socket can control the amount of unsent bytes in its write queue, + thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() + reports POLLOUT events if the amount of unsent bytes is below a per + socket value, and if the write queue is not full. sendmsg() will + also not add new buffers if the limit is hit. + + This global variable controls the amount of unsent data for + sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change + to the global variable has immediate effect. + + Default: UINT_MAX (0xFFFFFFFF) + +tcp_workaround_signed_windows - BOOLEAN + If set, assume no receipt of a window scaling option means the + remote TCP is broken and treats the window as a signed quantity. + If unset, assume the remote TCP is not broken even if we do + not receive a window scaling option from them. + + Default: 0 + +tcp_thin_linear_timeouts - BOOLEAN + Enable dynamic triggering of linear timeouts for thin streams. + If set, a check is performed upon retransmission by timeout to + determine if the stream is thin (less than 4 packets in flight). + As long as the stream is found to be thin, up to 6 linear + timeouts may be performed before exponential backoff mode is + initiated. This improves retransmission latency for + non-aggressive thin streams, often found to be time-dependent. + For more information on thin streams, see + Documentation/networking/tcp-thin.txt + + Default: 0 + +tcp_limit_output_bytes - INTEGER + Controls TCP Small Queue limit per tcp socket. + TCP bulk sender tends to increase packets in flight until it + gets losses notifications. With SNDBUF autotuning, this can + result in a large amount of packets queued on the local machine + (e.g.: qdiscs, CPU backlog, or device) hurting latency of other + flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes + limits the number of bytes on qdisc or device to reduce artificial + RTT/cwnd and reduce bufferbloat. 
+ + Default: 1048576 (16 * 65536) + +tcp_challenge_ack_limit - INTEGER + Limits number of Challenge ACK sent per second, as recommended + in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) + Default: 1000 + +tcp_rx_skb_cache - BOOLEAN + Controls a per TCP socket cache of one skb, that might help + performance of some workloads. This might be dangerous + on systems with a lot of TCP sockets, since it increases + memory usage. + + Default: 0 (disabled) + +UDP variables +============= + +udp_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 0 (disabled) + +udp_mem - vector of 3 INTEGERs: min, pressure, max + Number of pages allowed for queueing by all UDP sockets. + + min: Below this number of pages UDP is not bothered about its + memory appetite. When amount of memory allocated by UDP exceeds + this number, UDP starts to moderate memory usage. + + pressure: This value was introduced to follow format of tcp_mem. + + max: Number of pages allowed for queueing by all UDP sockets. + + Default is calculated at boot time from amount of available memory. + +udp_rmem_min - INTEGER + Minimal size of receive buffer used by UDP sockets in moderation. + Each UDP socket is able to use the size for receiving data, even if + total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + + Default: 4K + +udp_wmem_min - INTEGER + Minimal size of send buffer used by UDP sockets in moderation. + Each UDP socket is able to use the size for sending data, even if + total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + + Default: 4K + +RAW variables +============= + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 1 (enabled) + +CIPSOv4 Variables +================= + +cipso_cache_enable - BOOLEAN + If set, enable additions to and lookups from the CIPSO label mapping + cache. If unset, additions are ignored and lookups always result in a + miss. However, regardless of the setting the cache is still + invalidated when required when means you can safely toggle this on and + off and the cache will always be "safe". + + Default: 1 + +cipso_cache_bucket_size - INTEGER + The CIPSO label cache consists of a fixed size hash table with each + hash bucket containing a number of cache entries. This variable limits + the number of entries in each hash bucket; the larger the value the + more CIPSO label mappings that can be cached. When the number of + entries in a given hash bucket reaches this limit adding new entries + causes the oldest entry in the bucket to be removed to make room. + + Default: 10 + +cipso_rbm_optfmt - BOOLEAN + Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of + the CIPSO draft specification (see Documentation/netlabel for details). + This means that when set the CIPSO tag will be padded with empty + categories in order to make the packet data 32-bit aligned. + + Default: 0 + +cipso_rbm_structvalid - BOOLEAN + If set, do a very strict check of the CIPSO option when + ip_options_compile() is called. 
+
+CIPSOv4 Variables
+=================
+
+cipso_cache_enable - BOOLEAN
+    If set, enable additions to and lookups from the CIPSO label mapping
+    cache. If unset, additions are ignored and lookups always result in a
+    miss. However, regardless of the setting the cache is still
+    invalidated when required, which means you can safely toggle this on
+    and off and the cache will always be "safe".
+
+    Default: 1
+
+cipso_cache_bucket_size - INTEGER
+    The CIPSO label cache consists of a fixed size hash table with each
+    hash bucket containing a number of cache entries. This variable limits
+    the number of entries in each hash bucket; the larger the value the
+    more CIPSO label mappings that can be cached. When the number of
+    entries in a given hash bucket reaches this limit, adding new entries
+    causes the oldest entry in the bucket to be removed to make room.
+
+    Default: 10
+
+cipso_rbm_optfmt - BOOLEAN
+    Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of
+    the CIPSO draft specification (see Documentation/netlabel for details).
+    This means that when set the CIPSO tag will be padded with empty
+    categories in order to make the packet data 32-bit aligned.
+
+    Default: 0
+
+cipso_rbm_structvalid - BOOLEAN
+    If set, do a very strict check of the CIPSO option when
+    ip_options_compile() is called. If unset, relax the checks done during
+    ip_options_compile(). Either way is "safe" as errors are caught
+    elsewhere in the CIPSO processing code, but setting this to 0 (False)
+    should result in less work (i.e. it should be faster) but could cause
+    problems with other implementations that require strict checking.
+
+    Default: 0
+
+IP Variables
+============
+
+ip_local_port_range - 2 INTEGERS
+    Defines the local port range that is used by TCP and UDP to
+    choose the local port. The first number is the first local port
+    number, the second the last.
+    If possible, it is better if these numbers have different parity
+    (one even and one odd value).
+    Must be greater than or equal to ip_unprivileged_port_start.
+    The default values are 32768 and 60999 respectively.
+
+ip_local_reserved_ports - list of comma separated ranges
+    Specify the ports which are reserved for known third-party
+    applications. These ports will not be used by automatic port
+    assignments (e.g. when calling connect() or bind() with port
+    number 0). Explicit port allocation behavior is unchanged.
+
+    The format used for both input and output is a comma separated
+    list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+    10). Writing to the file will clear all previously reserved
+    ports and update the current list with the one given in the
+    input.
+
+    Note that ip_local_port_range and ip_local_reserved_ports
+    settings are independent and both are considered by the kernel
+    when determining which ports are available for automatic port
+    assignments.
+
+    You can reserve ports which are not in the current
+    ip_local_port_range, e.g.::
+
+        $ cat /proc/sys/net/ipv4/ip_local_port_range
+        32000 60999
+        $ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+        8080,9148
+
+    although this is redundant. However, such a setting is useful
+    if later the port range is changed to a value that will
+    include the reserved ports.
+
+    Default: Empty
+
+ip_unprivileged_port_start - INTEGER
+    This is a per-namespace sysctl. It defines the first
+    unprivileged port in the network namespace. Privileged ports
+    require root or CAP_NET_BIND_SERVICE in order to bind to them.
+    To disable all privileged ports, set this to 0. They must not
+    overlap with the ip_local_port_range.
+
+    Default: 1024
+
+ip_nonlocal_bind - BOOLEAN
+    If set, allows processes to bind() to non-local IP addresses,
+    which can be quite useful - but may break some applications.
+
+    Default: 0
+
+ip_autobind_reuse - BOOLEAN
+    By default, bind() does not select the ports automatically even if
+    the new socket and all sockets bound to the port have SO_REUSEADDR.
+    ip_autobind_reuse allows bind() to reuse the port and this is useful
+    when you use bind()+connect(), but may break some applications.
+    The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
+    option should only be set by experts.
+
+    Default: 0
+
+ip_dynaddr - BOOLEAN
+    If set non-zero, enables support for dynamic addresses.
+    If set to a non-zero value larger than 1, a kernel log
+    message will be printed when dynamic address rewriting
+    occurs.
+
+    Default: 0
+
+ip_early_demux - BOOLEAN
+    Optimize input packet processing down to one demux for
+    certain kinds of local sockets. Currently we only do this
+    for established TCP and connected UDP sockets.
+
+    It may add an additional cost for pure routing workloads that
+    reduces overall throughput, in which case you should disable it.
+
+    Default: 1
+
+ping_group_range - 2 INTEGERS
+    Restrict ICMP_PROTO datagram sockets to users in the group range.
+    The default is "1 0", meaning that nobody (not even root) may
+    create ping sockets. Setting it to "100 100" would grant permissions
+    to the single group. "0 4294967295" would enable it for the world,
+    "100 4294967295" would enable it for the users, but not daemons.
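+
+These unprivileged "ping" sockets are ordinary datagram sockets using
+IPPROTO_ICMP. A minimal illustrative sketch::
+
+    #include <netinet/in.h>
+    #include <sys/socket.h>
+
+    /* Succeeds only if the caller's group ID falls within
+     * net.ipv4.ping_group_range; otherwise fails with EACCES.
+     */
+    int open_ping_socket(void)
+    {
+            return socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
+    }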
+
+tcp_early_demux - BOOLEAN
+    Enable early demux for established TCP sockets.
+
+    Default: 1
+
+udp_early_demux - BOOLEAN
+    Enable early demux for connected UDP sockets. Disable this if
+    your workload consists mostly of unconnected traffic.
+
+    Default: 1
+
+icmp_echo_ignore_all - BOOLEAN
+    If set non-zero, then the kernel will ignore all ICMP ECHO
+    requests sent to it.
+
+    Default: 0
+
+icmp_echo_ignore_broadcasts - BOOLEAN
+    If set non-zero, then the kernel will ignore all ICMP ECHO and
+    TIMESTAMP requests sent to it via broadcast/multicast.
+
+    Default: 1
+
+icmp_ratelimit - INTEGER
+    Limit the maximal rates for sending ICMP packets whose type matches
+    icmp_ratemask (see below) to specific targets.
+    0 to disable any limiting,
+    otherwise the minimal space between responses in milliseconds.
+    Note that another sysctl, icmp_msgs_per_sec, limits the number
+    of ICMP packets sent on all targets.
+
+    Default: 1000
+
+icmp_msgs_per_sec - INTEGER
+    Limit the maximal number of ICMP packets sent per second from this
+    host. Only messages whose type matches icmp_ratemask (see below) are
+    controlled by this limit.
+
+    Default: 1000
+
+icmp_msgs_burst - INTEGER
+    icmp_msgs_per_sec controls the number of ICMP packets sent per second,
+    while icmp_msgs_burst controls the burst size of these packets.
+
+    Default: 50
+
+icmp_ratemask - INTEGER
+    Mask made of ICMP types for which rates are being limited.
+
+    Significant bits: IHGFEDCBA9876543210
+
+    Default mask: 0000001100000011000 (6168)
+
+    Bit definitions (see include/linux/icmp.h):
+
+    = =========================
+    0 Echo Reply
+    3 Destination Unreachable [1]_
+    4 Source Quench [1]_
+    5 Redirect
+    8 Echo Request
+    B Time Exceeded [1]_
+    C Parameter Problem [1]_
+    D Timestamp Request
+    E Timestamp Reply
+    F Info Request
+    G Info Reply
+    H Address Mask Request
+    I Address Mask Reply
+    = =========================
+
+    .. [1] These are rate limited by default (see default mask above)
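+
+As a worked example (illustrative only), the default mask follows from
+the rate-limited types above: 3 (Destination Unreachable), 4 (Source
+Quench), B = 11 (Time Exceeded) and C = 12 (Parameter Problem)::
+
+    #include <stdio.h>
+
+    int main(void)
+    {
+            /* Bits 3, 4, 11 and 12 of the default icmp_ratemask. */
+            unsigned int mask = (1u << 3) | (1u << 4) |
+                                (1u << 11) | (1u << 12);
+
+            printf("%u\n", mask);   /* prints 6168 */
+            return 0;
+    }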
+
+icmp_ignore_bogus_error_responses - BOOLEAN
+    Some routers violate RFC1122 by sending bogus responses to broadcast
+    frames. Such violations are normally logged via a kernel warning.
+    If this is set to TRUE, the kernel will not give such warnings, which
+    will avoid log file clutter.
+
+    Default: 1
+
+icmp_errors_use_inbound_ifaddr - BOOLEAN
+
+    If zero, icmp error messages are sent with the primary address of
+    the exiting interface.
+
+    If non-zero, the message will be sent with the primary address of
+    the interface that received the packet that caused the icmp error.
+    This is the behaviour many network administrators will expect from
+    a router, and it can make debugging complicated network layouts
+    much easier.
+
+    Note that if no primary address exists for the interface selected,
+    then the primary address of the first non-loopback interface that
+    has one will be used regardless of this setting.
+
+    Default: 0
+
+igmp_max_memberships - INTEGER
+    Change the maximum number of multicast groups we can subscribe to.
+    Default: 20
+
+    The theoretical maximum value is bounded by having to send a
+    membership report in a single datagram (i.e. the report can't span
+    multiple datagrams, or risk confusing the switch and leaving groups
+    you don't intend to).
+
+    The number of supported groups 'M' is bounded by the number of group
+    report entries you can fit into a single datagram of 65535 bytes:
+
+    M = (65536 - sizeof(IP header)) / sizeof(Group record)
+
+    Group records are variable length, with a minimum of 12 bytes.
+    So net.ipv4.igmp_max_memberships should not be set higher than:
+
+    (65536-24) / 12 = 5459
+
+    The value 5459 assumes no IP header options, so in practice
+    this number may be lower.
+
+igmp_max_msf - INTEGER
+    Maximum number of addresses allowed in the source filter list for a
+    multicast group.
+
+    Default: 10
+
+igmp_qrv - INTEGER
+    Controls the IGMP query robustness variable (see RFC2236 8.1).
+
+    Default: 2 (as specified by RFC2236 8.1)
+
+    Minimum: 1 (as specified by RFC6636 4.5)
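+
+For reference, the per-socket operation that igmp_max_memberships
+bounds is the IP_ADD_MEMBERSHIP group join. A minimal illustrative
+sketch; the group address is a placeholder::
+
+    #include <arpa/inet.h>
+    #include <netinet/in.h>
+    #include <string.h>
+    #include <sys/socket.h>
+
+    /* Join the (placeholder) group 239.1.2.3 on the default
+     * interface. Each socket may hold at most igmp_max_memberships
+     * such joins; further joins fail.
+     */
+    int join_group(int fd)
+    {
+            struct ip_mreq mreq;
+
+            memset(&mreq, 0, sizeof(mreq));
+            mreq.imr_multiaddr.s_addr = inet_addr("239.1.2.3");
+            mreq.imr_interface.s_addr = htonl(INADDR_ANY);
+
+            return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+                              &mreq, sizeof(mreq));
+    }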
+
+force_igmp_version - INTEGER
+    - 0 - (default) No enforcement of an IGMP version; IGMPv1/v2 fallback
+      is allowed. The kernel will revert to IGMPv3 mode again if all
+      IGMPv1/v2 Querier Present timers expire.
+    - 1 - Enforce the use of IGMP version 1. Will also reply with an
+      IGMPv1 report if an IGMPv2/v3 query is received.
+    - 2 - Enforce the use of IGMP version 2. Will fall back to IGMPv1 if
+      an IGMPv1 query message is received. Will reply with a report if an
+      IGMPv3 query is received.
+    - 3 - Enforce the use of IGMP version 3. Reacts the same as the
+      default 0.
+
+    .. note::
+
+       This is not the same as force_mld_version, because the Security
+       Considerations section of IGMPv3 (RFC3376) does not state clearly
+       that messages of other versions may be ignored completely, as
+       MLDv2 (RFC3810) does. So keeping this value at its default of 0
+       is recommended.
+
+``conf/interface/*``
+    changes special settings per interface (where "interface" is the
+    name of your network interface)
+
+``conf/all/*``
+    is special, changes the settings for all interfaces
+
+log_martians - BOOLEAN
+    Log packets with impossible addresses to kernel log.
+    log_martians for the interface will be enabled if at least one of
+    conf/{all,interface}/log_martians is set to TRUE,
+    it will be disabled otherwise
+
+accept_redirects - BOOLEAN
+    Accept ICMP redirect messages.
+    accept_redirects for the interface will be enabled if:
+
+    - both conf/{all,interface}/accept_redirects are TRUE in the case
+      forwarding for the interface is enabled
+
+    or
+
+    - at least one of conf/{all,interface}/accept_redirects is TRUE in the
+      case forwarding for the interface is disabled
+
+    accept_redirects for the interface will be disabled otherwise
+
+    default:
+
+    - TRUE (host)
+    - FALSE (router)
+
+forwarding - BOOLEAN
+    Enable IP forwarding on this interface. This controls whether packets
+    received _on_ this interface can be forwarded.
+
+mc_forwarding - BOOLEAN
+    Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
+    and a multicast routing daemon is required.
+    conf/all/mc_forwarding must also be set to TRUE to enable multicast
+    routing for the interface
+
+medium_id - INTEGER
+    Integer value used to differentiate the devices by the medium they
+    are attached to. Two devices can have different id values when
+    the broadcast packets are received only on one of them.
+    The default value 0 means that the device is the only interface
+    to its medium, value of -1 means that medium is not known.
+
+    Currently, it is used to change the proxy_arp behavior:
+    the proxy_arp feature is enabled for packets forwarded between
+    two devices attached to different media.
+
+proxy_arp - BOOLEAN
+    Do proxy arp.
+
+    proxy_arp for the interface will be enabled if at least one of
+    conf/{all,interface}/proxy_arp is set to TRUE,
+    it will be disabled otherwise
+
+proxy_arp_pvlan - BOOLEAN
+    Private VLAN proxy arp.
+
+    Basically allow proxy arp replies back to the same interface
+    (from which the ARP request/solicitation was received).
+
+    This is done to support (ethernet) switch features, like RFC
+    3069, where the individual ports are NOT allowed to
+    communicate with each other, but they are allowed to talk to
+    the upstream router. As described in RFC 3069, it is possible
+    to allow these hosts to communicate through the upstream
+    router by proxy_arp'ing. This does not need to be used together
+    with proxy_arp.
+
+    This technology is known by different names:
+
+    In RFC 3069 it is called VLAN Aggregation.
+    Cisco and Allied Telesyn call it Private VLAN.
+    Hewlett-Packard call it Source-Port filtering or port-isolation.
+    Ericsson call it MAC-Forced Forwarding (RFC Draft).
+
+shared_media - BOOLEAN
+    Send(router) or accept(host) RFC1620 shared media redirects.
+    Overrides secure_redirects.
+
+    shared_media for the interface will be enabled if at least one of
+    conf/{all,interface}/shared_media is set to TRUE,
+    it will be disabled otherwise
+
+    default TRUE
+
+secure_redirects - BOOLEAN
+    Accept ICMP redirect messages only to gateways listed in the
+    interface's current gateway list. Even if disabled, RFC1122 redirect
+    rules still apply.
+
+    Overridden by shared_media.
+
+    secure_redirects for the interface will be enabled if at least one of
+    conf/{all,interface}/secure_redirects is set to TRUE,
+    it will be disabled otherwise
+
+    default TRUE
+
+send_redirects - BOOLEAN
+    Send redirects, if router.
+
+    send_redirects for the interface will be enabled if at least one of
+    conf/{all,interface}/send_redirects is set to TRUE,
+    it will be disabled otherwise
+
+    Default: TRUE
+
+bootp_relay - BOOLEAN
+    Accept packets with source address 0.b.c.d destined
+    not to this host as local ones. It is assumed that a
+    BOOTP relay daemon will catch and forward such packets.
+    conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay
+    for the interface
+
+    default FALSE
+
+    Not Implemented Yet.
+
+accept_source_route - BOOLEAN
+    Accept packets with SRR option.
+    conf/all/accept_source_route must also be set to TRUE to accept packets
+    with SRR option on the interface
+
+    default
+
+    - TRUE (router)
+    - FALSE (host)
+
+accept_local - BOOLEAN
+    Accept packets with local source addresses. In combination with
+    suitable routing, this can be used to direct packets between two
+    local interfaces over the wire and have them accepted properly.
+
+    default FALSE
+
+route_localnet - BOOLEAN
+    Do not consider loopback addresses as martian source or destination
+    while routing. This enables the use of 127/8 for local routing purposes.
+
+    default FALSE
+
+rp_filter - INTEGER
+    - 0 - No source validation.
+    - 1 - Strict mode as defined in RFC3704 Strict Reverse Path
+      Each incoming packet is tested against the FIB and if the interface
+      is not the best reverse path the packet check will fail.
+      By default failed packets are discarded.
+    - 2 - Loose mode as defined in RFC3704 Loose Reverse Path
+      Each incoming packet's source address is also tested against the FIB
+      and if the source address is not reachable via any interface
+      the packet check will fail.
+
+    Current recommended practice in RFC3704 is to enable strict mode
+    to prevent IP spoofing from DDoS attacks.
If using asymmetric routing + or other complicated routing, then loose mode is recommended. + + The max value from conf/{all,interface}/rp_filter is used + when doing source validation on the {interface}. + + Default value is 0. Note that some distributions enable it + in startup scripts. + +arp_filter - BOOLEAN + - 1 - Allows you to have multiple network interfaces on the same + subnet, and have the ARPs for each interface be answered + based on whether or not the kernel would route a packet from + the ARP'd IP out that interface (therefore you must use source + based routing for this to work). In other words it allows control + of which cards (usually 1) will respond to an arp request. + + - 0 - (default) The kernel can respond to arp requests with addresses + from other interfaces. This may seem wrong but it usually makes + sense, because it increases the chance of successful communication. + IP addresses are owned by the complete host on Linux, not by + particular interfaces. Only for more complex setups like load- + balancing, does this behaviour cause problems. + + arp_filter for the interface will be enabled if at least one of + conf/{all,interface}/arp_filter is set to TRUE, + it will be disabled otherwise + +arp_announce - INTEGER + Define different restriction levels for announcing the local + source IP address from IP packets in ARP requests sent on + interface: + + - 0 - (default) Use any local address, configured on any interface + - 1 - Try to avoid local addresses that are not in the target's + subnet for this interface. This mode is useful when target + hosts reachable via this interface require the source IP + address in ARP requests to be part of their logical network + configured on the receiving interface. When we generate the + request we will check all our subnets that include the + target IP and will preserve the source address if it is from + such subnet. If there is no such subnet we select source + address according to the rules for level 2. + - 2 - Always use the best local address for this target. + In this mode we ignore the source address in the IP packet + and try to select local address that we prefer for talks with + the target host. Such local address is selected by looking + for primary IP addresses on all our subnets on the outgoing + interface that include the target IP address. If no suitable + local address is found we select the first local address + we have on the outgoing interface or on all other interfaces, + with the hope we will receive reply for our request and + even sometimes no matter the source IP address we announce. + + The max value from conf/{all,interface}/arp_announce is used. + + Increasing the restriction level gives more chance for + receiving answer from the resolved target while decreasing + the level announces more valid sender's information. 
+
+arp_ignore - INTEGER
+    Define different modes for sending replies in response to
+    received ARP requests that resolve local target IP addresses:
+
+    - 0 - (default): reply for any local target IP address, configured
+      on any interface
+    - 1 - reply only if the target IP address is a local address
+      configured on the incoming interface
+    - 2 - reply only if the target IP address is a local address
+      configured on the incoming interface and the sender's IP address
+      is part of the same subnet on this interface
+    - 3 - do not reply for local addresses configured with scope host,
+      only resolutions for global and link addresses are answered
+    - 4-7 - reserved
+    - 8 - do not reply for all local addresses
+
+    The max value from conf/{all,interface}/arp_ignore is used
+    when an ARP request is received on the {interface}
+
+arp_notify - BOOLEAN
+    Define mode for notification of address and device changes.
+
+    == ==========================================================
+    0  (default): do nothing
+    1  Generate gratuitous arp requests when device is brought up
+       or hardware address changes.
+    == ==========================================================
+
+arp_accept - BOOLEAN
+    Define behavior for gratuitous ARP frames whose IP is not
+    already present in the ARP table:
+
+    - 0 - don't create new entries in the ARP table
+    - 1 - create new entries in the ARP table
+
+    Both gratuitous ARP replies and requests will trigger the
+    ARP table to be updated, if this setting is on.
+
+    If the ARP table already contains the IP address of the
+    gratuitous arp frame, the arp table will be updated regardless
+    of whether this setting is on or off.
+
+mcast_solicit - INTEGER
+    The maximum number of multicast probes in INCOMPLETE state,
+    when the associated hardware address is unknown. Defaults
+    to 3.
+
+ucast_solicit - INTEGER
+    The maximum number of unicast probes in PROBE state, when
+    the hardware address is being reconfirmed. Defaults to 3.
+
+app_solicit - INTEGER
+    The maximum number of probes to send to the user space ARP daemon
+    via netlink before dropping back to multicast probes (see
+    mcast_resolicit). Defaults to 0.
+
+mcast_resolicit - INTEGER
+    The maximum number of multicast probes after unicast and
+    app probes in PROBE state. Defaults to 0.
+
+disable_policy - BOOLEAN
+    Disable IPSEC policy (SPD) for this interface
+
+disable_xfrm - BOOLEAN
+    Disable IPSEC encryption on this interface, whatever the policy
+
+igmpv2_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    IGMPv1 or IGMPv2 report retransmit will take place.
+
+    Default: 10000 (10 seconds)
+
+igmpv3_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    IGMPv3 report retransmit will take place.
+
+    Default: 1000 (1 second)
+
+promote_secondaries - BOOLEAN
+    When a primary IP address is removed from this interface,
+    promote a corresponding secondary IP address instead of
+    removing all the corresponding secondary IP addresses.
+
+drop_unicast_in_l2_multicast - BOOLEAN
+    Drop any unicast IP packets that are received in link-layer
+    multicast (or broadcast) frames.
+
+    This behavior (for multicast) is actually a SHOULD in RFC
+    1122, but is disabled by default for compatibility reasons.
+
+    Default: off (0)
+
+drop_gratuitous_arp - BOOLEAN
+    Drop all gratuitous ARP frames, for example if there's a known
+    good ARP proxy on the network and such frames need not be used
+    (or in the case of 802.11, must not be used to prevent attacks.)
+
+    Default: off (0)
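+
+Several of the ARP entries above take the maximum of the conf/all and
+per-interface values. For illustration, a minimal sketch that sets one
+per-interface knob through procfs; "eth0" is a placeholder interface
+name::
+
+    #include <stdio.h>
+
+    /* Write a value to a per-interface sysctl. For entries such as
+     * arp_ignore the effective setting is the maximum of this and
+     * conf/all/arp_ignore.
+     */
+    int set_arp_ignore(const char *val)
+    {
+            FILE *f;
+
+            f = fopen("/proc/sys/net/ipv4/conf/eth0/arp_ignore", "w");
+            if (!f)
+                    return -1;
+            fprintf(f, "%s\n", val);
+            return fclose(f);
+    }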
+
+
+tag - INTEGER
+    Allows you to write a number, which can be used as required.
+
+    Default value is 0.
+
+xfrm4_gc_thresh - INTEGER
+    (Obsolete since linux-4.14)
+    The threshold at which we will start garbage collecting for IPv4
+    destination cache entries. At twice this value the system will
+    refuse new allocations.
+
+igmp_link_local_mcast_reports - BOOLEAN
+    Enable IGMP reports for link local multicast groups in the
+    224.0.0.X range.
+
+    Default TRUE
+
+Alexey Kuznetsov.
+kuznet@ms2.inr.ac.ru
+
+Updated by:
+
+- Andi Kleen
+  ak@muc.de
+- Nicolas Delon
+  delon.nicolas@wanadoo.fr
+
+
+
+/proc/sys/net/ipv6/* Variables
+==============================
+
+IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also
+apply to IPv6 [XXX?].
+
+bindv6only - BOOLEAN
+    Default value for IPV6_V6ONLY socket option,
+    which restricts use of the IPv6 socket to IPv6 communication
+    only.
+
+    - TRUE: disable IPv4-mapped address feature
+    - FALSE: enable IPv4-mapped address feature
+
+    Default: FALSE (as specified in RFC3493)
+
+flowlabel_consistency - BOOLEAN
+    Protect the consistency (and unicity) of flow label.
+    You have to disable it to use IPV6_FL_F_REFLECT flag on the
+    flow label manager.
+
+    - TRUE: enabled
+    - FALSE: disabled
+
+    Default: TRUE
+
+auto_flowlabels - INTEGER
+    Automatically generate flow labels based on a flow hash of the
+    packet. This allows intermediate devices, such as routers, to
+    identify packet flows for mechanisms like Equal Cost Multipath
+    Routing (see RFC 6438).
+
+    = ===========================================================
+    0 automatic flow labels are completely disabled
+    1 automatic flow labels are enabled by default, they can be
+      disabled on a per socket basis using the IPV6_AUTOFLOWLABEL
+      socket option
+    2 automatic flow labels are allowed, they may be enabled on a
+      per socket basis using the IPV6_AUTOFLOWLABEL socket option
+    3 automatic flow labels are enabled and enforced, they cannot
+      be disabled by the socket option
+    = ===========================================================
+
+    Default: 1
+
+flowlabel_state_ranges - BOOLEAN
+    Split the flow label number space into two ranges. 0-0x7FFFF is
+    reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF
+    is reserved for stateless flow labels as described in RFC6437.
+
+    - TRUE: enabled
+    - FALSE: disabled
+
+    Default: TRUE
+
+flowlabel_reflect - INTEGER
+    Control flow label reflection. Needed for Path MTU
+    Discovery to work with Equal Cost Multipath Routing in anycast
+    environments. See RFC 7690 and:
+    https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
+
+    This is a bitmask.
+
+    - 1: enabled for established flows
+
+      Note that this prevents automatic flowlabel changes, as done
+      in "tcp: change IPv6 flow-label upon receiving spurious retransmission"
+      and "tcp: Change txhash on every SYN and RTO retransmit"
+
+    - 2: enabled for TCP RESET packets (no active listener)
+      If set, a RST packet sent in response to a SYN packet on a closed
+      port will reflect the incoming flow label.
+
+    - 4: enabled for ICMPv6 echo reply messages.
+
+    Default: 0
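+
+The per-socket override mentioned under auto_flowlabels is the
+IPV6_AUTOFLOWLABEL socket option. A minimal illustrative sketch; the
+fallback define mirrors include/uapi/linux/in6.h::
+
+    #include <netinet/in.h>
+    #include <sys/socket.h>
+
+    #ifndef IPV6_AUTOFLOWLABEL
+    #define IPV6_AUTOFLOWLABEL 70
+    #endif
+
+    /* Opt this socket out of automatic flow labels (honoured when
+     * auto_flowlabels is 1 or 2, but not 3).
+     */
+    int disable_auto_flowlabel(int fd)
+    {
+            int off = 0;
+
+            return setsockopt(fd, IPPROTO_IPV6, IPV6_AUTOFLOWLABEL,
+                              &off, sizeof(off));
+    }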
+
+fib_multipath_hash_policy - INTEGER
+    Controls which hash policy to use for multipath routes.
+
+    Default: 0 (Layer 3)
+
+    Possible values:
+
+    - 0 - Layer 3 (source and destination addresses plus flow label)
+    - 1 - Layer 4 (standard 5-tuple)
+    - 2 - Layer 3 or inner Layer 3 if present
+
+anycast_src_echo_reply - BOOLEAN
+    Controls the use of anycast addresses as source addresses for ICMPv6
+    echo reply
+
+    - TRUE: enabled
+    - FALSE: disabled
+
+    Default: FALSE
+
+idgen_delay - INTEGER
+    Controls the delay in seconds after which time to retry
+    privacy stable address generation if a DAD conflict is
+    detected.
+
+    Default: 1 (as specified in RFC7217)
+
+idgen_retries - INTEGER
+    Controls the number of retries to generate a stable privacy
+    address if a DAD conflict is detected.
+
+    Default: 3 (as specified in RFC7217)
+
+mld_qrv - INTEGER
+    Controls the MLD query robustness variable (see RFC3810 9.1).
+
+    Default: 2 (as specified by RFC3810 9.1)
+
+    Minimum: 1 (as specified by RFC6636 4.5)
+
+max_dst_opts_number - INTEGER
+    Maximum number of non-padding TLVs allowed in a Destination
+    options extension header. If this value is less than zero
+    then unknown options are disallowed and the number of known
+    TLVs allowed is the absolute value of this number.
+
+    Default: 8
+
+max_hbh_opts_number - INTEGER
+    Maximum number of non-padding TLVs allowed in a Hop-by-Hop
+    options extension header. If this value is less than zero
+    then unknown options are disallowed and the number of known
+    TLVs allowed is the absolute value of this number.
+
+    Default: 8
+
+max_dst_opts_length - INTEGER
+    Maximum length allowed for a Destination options extension
+    header.
+
+    Default: INT_MAX (unlimited)
+
+max_hbh_length - INTEGER
+    Maximum length allowed for a Hop-by-Hop options extension
+    header.
+
+    Default: INT_MAX (unlimited)
+
+skip_notify_on_dev_down - BOOLEAN
+    Controls whether an RTM_DELROUTE message is generated for routes
+    removed when a device is taken down or deleted. IPv4 does not
+    generate this message; IPv6 does by default. Setting this sysctl
+    to true skips the message, making IPv4 and IPv6 on par in relying
+    on userspace caches to track link events and evict routes.
+
+    Default: false (generate message)
+
+nexthop_compat_mode - BOOLEAN
+    The new nexthop API provides a means for managing nexthops
+    independent of prefixes. Backwards compatibility with the old route
+    format is enabled by default, which means route dumps and
+    notifications contain the new nexthop attribute but also the full,
+    expanded nexthop definition. Further, updates or deletes of a
+    nexthop configuration generate route notifications for each fib
+    entry using the nexthop. Once a system understands the new API,
+    this sysctl can be disabled to achieve the full performance benefits
+    of the new API by disabling the nexthop expansion and extraneous
+    notifications.
+
+    Default: true (backward compat mode)
+
+IPv6 Fragmentation:
+
+ip6frag_high_thresh - INTEGER
+    Maximum memory used to reassemble IPv6 fragments. When
+    ip6frag_high_thresh bytes of memory is allocated for this purpose,
+    the fragment handler will toss packets until ip6frag_low_thresh
+    is reached.
+
+ip6frag_low_thresh - INTEGER
+    See ip6frag_high_thresh
+
+ip6frag_time - INTEGER
+    Time in seconds to keep an IPv6 fragment in memory.
+
+IPv6 Segment Routing:
+
+seg6_flowlabel - INTEGER
+    Controls the behaviour of computing the flowlabel of the outer
+    IPv6 header in case of SR T.encaps
+
+    == =======================================================
+    -1 set flowlabel to zero.
+    0  copy flowlabel from inner packet in case of inner IPv6
+       (set flowlabel to 0 in case of IPv4/L2)
+    1  compute the flowlabel using seg6_make_flowlabel()
+    == =======================================================
+
+    Default is 0.
+
+``conf/default/*``:
+    Change the interface-specific default settings.
+
+
+``conf/all/*``:
+    Change all the interface-specific settings.
+
+    [XXX: Other special features than forwarding?]
+
+conf/all/forwarding - BOOLEAN
+    Enable global IPv6 forwarding between all interfaces.
+
+    IPv4 and IPv6 work differently here; e.g. netfilter must be used
+    to control which interfaces may forward packets and which not.
+
+    This also sets all interfaces' Host/Router setting
+    'forwarding' to the specified value. See below for details.
+
+    This is referred to as global forwarding.
+
+proxy_ndp - BOOLEAN
+    Do proxy ndp.
+
+fwmark_reflect - BOOLEAN
+    Controls the fwmark of kernel-generated IPv6 reply packets that are not
+    associated with a socket (for example, TCP RSTs or ICMPv6 echo replies).
+    If unset, these packets have a fwmark of zero. If set, they have the
+    fwmark of the packet they are replying to.
+
+    Default: 0
+
+``conf/interface/*``:
+    Change special settings per interface.
+
+    The functional behaviour for certain settings is different
+    depending on whether local forwarding is enabled or not.
+
+accept_ra - INTEGER
+    Accept Router Advertisements; autoconfigure using them.
+
+    It also determines whether or not to transmit Router
+    Solicitations. If and only if the functional setting is to
+    accept Router Advertisements, Router Solicitations will be
+    transmitted.
+
+    Possible values are:
+
+    == ===========================================================
+    0  Do not accept Router Advertisements.
+    1  Accept Router Advertisements if forwarding is disabled.
+    2  Overrule forwarding behaviour. Accept Router Advertisements
+       even if forwarding is enabled.
+    == ===========================================================
+
+    Functional default:
+
+    - enabled if local forwarding is disabled.
+    - disabled if local forwarding is enabled.
+
+accept_ra_defrtr - BOOLEAN
+    Learn default router in Router Advertisement.
+
+    Functional default:
+
+    - enabled if accept_ra is enabled.
+    - disabled if accept_ra is disabled.
+
+accept_ra_from_local - BOOLEAN
+    Accept an RA whose source address is found on the local machine,
+    if the RA is otherwise proper and able to be accepted.
+
+    Default is to NOT accept these as it may be an unintended
+    network loop.
+
+    Functional default:
+
+    - enabled if accept_ra_from_local is enabled
+      on a specific interface.
+    - disabled if accept_ra_from_local is disabled
+      on a specific interface.
+
+accept_ra_min_hop_limit - INTEGER
+    Minimum hop limit Information in Router Advertisement.
+
+    Hop limit Information in Router Advertisement less than this
+    variable shall be ignored.
+
+    Default: 1
+
+accept_ra_pinfo - BOOLEAN
+    Learn Prefix Information in Router Advertisement.
+
+    Functional default:
+
+    - enabled if accept_ra is enabled.
+    - disabled if accept_ra is disabled.
+
+accept_ra_rt_info_min_plen - INTEGER
+    Minimum prefix length of Route Information in RA.
+
+    Route Information w/ prefix smaller than this variable shall
+    be ignored.
+
+    Functional default:
+
+    * 0 if accept_ra_rtr_pref is enabled.
+    * -1 if accept_ra_rtr_pref is disabled.
+
+accept_ra_rt_info_max_plen - INTEGER
+    Maximum prefix length of Route Information in RA.
+
+    Route Information w/ prefix larger than this variable shall
+    be ignored.
+ + Functional default: + + * 0 if accept_ra_rtr_pref is enabled. + * -1 if accept_ra_rtr_pref is disabled. + +accept_ra_rtr_pref - BOOLEAN + Accept Router Preference in RA. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_ra_mtu - BOOLEAN + Apply the MTU value specified in RA option 5 (RFC4861). If + disabled, the MTU specified in the RA will be ignored. + + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. + +accept_redirects - BOOLEAN + Accept Redirects. + + Functional default: + + - enabled if local forwarding is disabled. + - disabled if local forwarding is enabled. + +accept_source_route - INTEGER + Accept source routing (routing extension header). + + - >= 0: Accept only routing header type 2. + - < 0: Do not accept routing header. + + Default: 0 + +autoconf - BOOLEAN + Autoconfigure addresses using Prefix Information in Router + Advertisements. + + Functional default: + + - enabled if accept_ra_pinfo is enabled. + - disabled if accept_ra_pinfo is disabled. + +dad_transmits - INTEGER + The amount of Duplicate Address Detection probes to send. + + Default: 1 + +forwarding - INTEGER + Configure interface-specific Host/Router behaviour. + + .. note:: + + It is recommended to have the same setting on all + interfaces; mixed router/host scenarios are rather uncommon. + + Possible values are: + + - 0 Forwarding disabled + - 1 Forwarding enabled + + **FALSE (0)**: + + By default, Host behaviour is assumed. This means: + + 1. IsRouter flag is not set in Neighbour Advertisements. + 2. If accept_ra is TRUE (default), transmit Router + Solicitations. + 3. If accept_ra is TRUE (default), accept Router + Advertisements (and do autoconfiguration). + 4. If accept_redirects is TRUE (default), accept Redirects. + + **TRUE (1)**: + + If local forwarding is enabled, Router behaviour is assumed. + This means exactly the reverse from the above: + + 1. IsRouter flag is set in Neighbour Advertisements. + 2. Router Solicitations are not sent unless accept_ra is 2. + 3. Router Advertisements are ignored unless accept_ra is 2. + 4. Redirects are ignored. + + Default: 0 (disabled) if global forwarding is disabled (default), + otherwise 1 (enabled). + +hop_limit - INTEGER + Default Hop Limit to set. + + Default: 64 + +mtu - INTEGER + Default Maximum Transfer Unit + + Default: 1280 (IPv6 required minimum) + +ip_nonlocal_bind - BOOLEAN + If set, allows processes to bind() to non-local IPv6 addresses, + which can be quite useful - but may break some applications. + + Default: 0 + +router_probe_interval - INTEGER + Minimum interval (in seconds) between Router Probing described + in RFC4191. + + Default: 60 + +router_solicitation_delay - INTEGER + Number of seconds to wait after interface is brought up + before sending Router Solicitations. + + Default: 1 + +router_solicitation_interval - INTEGER + Number of seconds to wait between Router Solicitations. + + Default: 4 + +router_solicitations - INTEGER + Number of Router Solicitations to send until assuming no + routers are present. + + Default: 3 + +use_oif_addrs_only - BOOLEAN + When enabled, the candidate source addresses for destinations + routed via this interface are restricted to the set of addresses + configured on this interface (vis. RFC 6724, section 4). + + Default: false + +use_tempaddr - INTEGER + Preference for Privacy Extensions (RFC3041). 
+
+    * <= 0 : disable Privacy Extensions
+    * == 1 : enable Privacy Extensions, but prefer public
+      addresses over temporary addresses.
+    * > 1 : enable Privacy Extensions and prefer temporary
+      addresses over public addresses.
+
+    Default:
+
+    * 0 (for most devices)
+    * -1 (for point-to-point devices and loopback devices)
+
+temp_valid_lft - INTEGER
+    Valid lifetime (in seconds) for temporary addresses.
+
+    Default: 604800 (7 days)
+
+temp_prefered_lft - INTEGER
+    Preferred lifetime (in seconds) for temporary addresses.
+
+    Default: 86400 (1 day)
+
+keep_addr_on_down - INTEGER
+    Keep all IPv6 addresses on an interface down event. If set,
+    static global addresses with no expiration time are not flushed.
+
+    * >0 : enabled
+    * 0 : system default
+    * <0 : disabled
+
+    Default: 0 (addresses are removed)
+
+max_desync_factor - INTEGER
+    Maximum value for DESYNC_FACTOR, which is a random value
+    that ensures that clients don't synchronize with each
+    other and generate new addresses at exactly the same time.
+    The value is in seconds.
+
+    Default: 600
+
+regen_max_retry - INTEGER
+    Number of attempts before giving up on generating
+    valid temporary addresses.
+
+    Default: 5
+
+max_addresses - INTEGER
+    Maximum number of autoconfigured addresses per interface. Setting
+    this to zero disables the limitation. It is not recommended to set
+    this value too large (or to zero) because it would be an easy way to
+    crash the kernel by allowing too many addresses to be created.
+
+    Default: 16
+
+disable_ipv6 - BOOLEAN
+    Disable IPv6 operation. If accept_dad is set to 2, this value
+    will be dynamically set to TRUE if DAD fails for the link-local
+    address.
+
+    Default: FALSE (enable IPv6 operation)
+
+    When this value is changed from 1 to 0 (IPv6 is being enabled),
+    it will dynamically create a link-local address on the given
+    interface and start Duplicate Address Detection, if necessary.
+
+    When this value is changed from 0 to 1 (IPv6 is being disabled),
+    it will dynamically delete all addresses and routes on the given
+    interface. From then on it will not be possible to add addresses
+    or routes to the selected interface.
+
+accept_dad - INTEGER
+    Whether to accept DAD (Duplicate Address Detection).
+
+    == ==============================================================
+    0  Disable DAD
+    1  Enable DAD (default)
+    2  Enable DAD, and disable IPv6 operation if MAC-based duplicate
+       link-local address has been found.
+    == ==============================================================
+
+    DAD operation and mode on a given interface will be selected according
+    to the maximum value of conf/{all,interface}/accept_dad.
+
+force_tllao - BOOLEAN
+    Enable sending the target link-layer address option even when
+    responding to a unicast neighbor solicitation.
+
+    Default: FALSE
+
+    Quoting from RFC 2461, section 4.4, Target link-layer address:
+
+    "The option MUST be included for multicast solicitations in order to
+    avoid infinite Neighbor Solicitation "recursion" when the peer node
+    does not have a cache entry to return a Neighbor Advertisements
+    message. When responding to unicast solicitations, the option can be
+    omitted since the sender of the solicitation has the correct link-
+    layer address; otherwise it would not have be able to send the unicast
+    solicitation in the first place. However, including the link-layer
+    address in this case adds little overhead and eliminates a potential
+    race condition where the sender deletes the cached link-layer address
+    prior to receiving a response to a previous solicitation."
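+
+Per RFC 5014, a socket can express the source-address preference that
+use_tempaddr above controls globally. A minimal illustrative sketch,
+assuming a libc that exposes the IPV6_ADDR_PREFERENCES option (glibc
+does)::
+
+    #include <netinet/in.h>
+    #include <sys/socket.h>
+
+    /* Prefer temporary (privacy) source addresses for this socket,
+     * independent of the use_tempaddr sysctl (RFC 5014).
+     */
+    int prefer_temporary_src(int fd)
+    {
+            int pref = IPV6_PREFER_SRC_TMP;
+
+            return setsockopt(fd, IPPROTO_IPV6, IPV6_ADDR_PREFERENCES,
+                              &pref, sizeof(pref));
+    }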
+
+ndisc_notify - BOOLEAN
+    Define mode for notification of address and device changes.
+
+    * 0 - (default): do nothing
+    * 1 - Generate unsolicited neighbour advertisements when device is
+      brought up or hardware address changes.
+
+ndisc_tclass - INTEGER
+    The IPv6 Traffic Class to use by default when sending IPv6 Neighbor
+    Discovery (Router Solicitation, Router Advertisement, Neighbor
+    Solicitation, Neighbor Advertisement, Redirect) messages.
+    These 8 bits can be interpreted as 6 high order bits holding the DSCP
+    value and 2 low order bits representing ECN (which you probably want
+    to leave cleared).
+
+    * 0 - (default)
+
+mldv1_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    MLDv1 report retransmit will take place.
+
+    Default: 10000 (10 seconds)
+
+mldv2_unsolicited_report_interval - INTEGER
+    The interval in milliseconds in which the next unsolicited
+    MLDv2 report retransmit will take place.
+
+    Default: 1000 (1 second)
+
+force_mld_version - INTEGER
+    * 0 - (default) No enforcement of an MLD version, MLDv1 fallback allowed
+    * 1 - Enforce to use MLD version 1
+    * 2 - Enforce to use MLD version 2
+
+suppress_frag_ndisc - INTEGER
+    Control RFC 6980 (Security Implications of IPv6 Fragmentation
+    with IPv6 Neighbor Discovery) behavior:
+
+    * 1 - (default) discard fragmented neighbor discovery packets
+    * 0 - allow fragmented neighbor discovery packets
+
+optimistic_dad - BOOLEAN
+    Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+
+    * 0: disabled (default)
+    * 1: enabled
+
+    Optimistic Duplicate Address Detection for the interface will be enabled
+    if at least one of conf/{all,interface}/optimistic_dad is set to 1,
+    it will be disabled otherwise.
+
+use_optimistic - BOOLEAN
+    If enabled, do not classify optimistic addresses as deprecated during
+    source address selection. Preferred addresses will still be chosen
+    before optimistic addresses, subject to other ranking in the source
+    address selection algorithm.
+
+    * 0: disabled (default)
+    * 1: enabled
+
+    This will be enabled if at least one of
+    conf/{all,interface}/use_optimistic is set to 1, disabled otherwise.
+
+stable_secret - IPv6 address
+    This IPv6 address will be used as a secret to generate IPv6
+    addresses for link-local addresses and autoconfigured
+    ones. All addresses generated after setting this secret will
+    be stable privacy ones by default. This can be changed via the
+    addrgenmode option of ip link. conf/default/stable_secret is used
+    as the secret for the namespace, the interface specific ones can
+    overwrite that. Writes to conf/all/stable_secret are refused.
+
+    It is recommended to generate this secret during installation
+    of a system and keep it stable after that.
+
+    By default the stable secret is unset.
+
+addr_gen_mode - INTEGER
+    Defines how link-local and autoconf addresses are generated.
+
+    = =================================================================
+    0 generate address based on EUI64 (default)
+    1 do not generate a link-local address, use EUI64 for addresses
+      generated from autoconf
+    2 generate stable privacy addresses, using the secret from
+      stable_secret (RFC7217)
+    3 generate stable privacy addresses, using a random secret if unset
+    = =================================================================
+
+drop_unicast_in_l2_multicast - BOOLEAN
+    Drop any unicast IPv6 packets that are received in link-layer
+    multicast (or broadcast) frames.
+
+    By default this is turned off.
+ +drop_unsolicited_na - BOOLEAN + Drop all unsolicited neighbor advertisements, for example if there's + a known good NA proxy on the network and such frames need not be used + (or in the case of 802.11, must not be used to prevent attacks.) + + By default this is turned off. + +enhanced_dad - BOOLEAN + Include a nonce option in the IPv6 neighbor solicitation messages used for + duplicate address detection per RFC7527. A received DAD NS will only signal + a duplicate address if the nonce is different. This avoids any false + detection of duplicates due to loopback of the NS messages that we send. + The nonce option will be sent on an interface unless both of + conf/{all,interface}/enhanced_dad are set to FALSE. + + Default: TRUE + +``icmp/*``: +=========== + +ratelimit - INTEGER + Limit the maximal rates for sending ICMPv6 messages. + + 0 to disable any limiting, + otherwise the minimal space between responses in milliseconds. + + Default: 1000 + +ratemask - list of comma separated ranges + For ICMPv6 message types matching the ranges in the ratemask, limit + the sending of the message according to ratelimit parameter. + + The format used for both input and output is a comma separated + list of ranges (e.g. "0-127,129" for ICMPv6 message type 0 to 127 and + 129). Writing to the file will clear all previous ranges of ICMPv6 + message types and update the current list with the input. + + Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml + for numerical values of ICMPv6 message types, e.g. echo request is 128 + and echo reply is 129. + + Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) + +echo_ignore_all - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol. + + Default: 0 + +echo_ignore_multicast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol via multicast. + + Default: 0 + +echo_ignore_anycast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol destined to anycast address. + + Default: 0 + +xfrm6_gc_thresh - INTEGER + (Obsolete since linux-4.14) + The threshold at which we will start garbage collecting for IPv6 + destination cache entries. At twice this value the system will + refuse new allocations. + + +IPv6 Update by: +Pekka Savola +YOSHIFUJI Hideaki / USAGI Project + + +/proc/sys/net/bridge/* Variables: +================================= + +bridge-nf-call-arptables - BOOLEAN + - 1 : pass bridged ARP traffic to arptables' FORWARD chain. + - 0 : disable this. + + Default: 1 + +bridge-nf-call-iptables - BOOLEAN + - 1 : pass bridged IPv4 traffic to iptables' chains. + - 0 : disable this. + + Default: 1 + +bridge-nf-call-ip6tables - BOOLEAN + - 1 : pass bridged IPv6 traffic to ip6tables' chains. + - 0 : disable this. + + Default: 1 + +bridge-nf-filter-vlan-tagged - BOOLEAN + - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. + - 0 : disable this. + + Default: 0 + +bridge-nf-filter-pppoe-tagged - BOOLEAN + - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. + - 0 : disable this. + + Default: 0 + +bridge-nf-pass-vlan-input-dev - BOOLEAN + - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan + interface on the bridge and set the netfilter input device to the + vlan. This allows use of e.g. "iptables -i br0.1" and makes the + REDIRECT target work with vlan-on-top-of-bridge interfaces. 
When no
+    matching vlan interface is found, or this switch is off, the input
+    device is set to the bridge interface.
+
+    - 0: disable bridge netfilter vlan interface lookup.
+
+    Default: 0
+
+``proc/sys/net/sctp/*`` Variables:
+==================================
+
+addip_enable - BOOLEAN
+    Enable or disable the extension of Dynamic Address Reconfiguration
+    (ADD-IP) functionality specified in RFC5061. This extension provides
+    the ability to dynamically add and remove new addresses for the SCTP
+    associations.
+
+    1: Enable extension.
+
+    0: Disable extension.
+
+    Default: 0
+
+pf_enable - INTEGER
+    Enable or disable pf (pf is short for potentially failed) state. A
+    value of pf_retrans > path_max_retrans also disables pf state. That
+    is, either pf_enable or pf_retrans > path_max_retrans can disable
+    pf state. Since pf_retrans and path_max_retrans can be changed by a
+    userspace application, a user may expect pf state to stay disabled
+    because pf_retrans > path_max_retrans, yet find it re-enabled after
+    one of the two values is changed. This sysctl was therefore added
+    to enable and disable pf state dynamically. See:
+    https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for
+    details.
+
+    1: Enable pf.
+
+    0: Disable pf.
+
+    Default: 1
+
+pf_expose - INTEGER
+    Unset or enable/disable pf (pf is short for potentially failed) state
+    exposure. Applications can control the exposure of the PF path state
+    in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO
+    sockopt. When it's unset, no SCTP_PEER_ADDR_CHANGE event with
+    SCTP_ADDR_PF state will be sent, but a SCTP_PF-state transport info
+    can still be obtained via the SCTP_GET_PEER_ADDR_INFO sockopt. When
+    it's enabled, a SCTP_PEER_ADDR_CHANGE event will be sent for a
+    transport becoming SCTP_PF state, and a SCTP_PF-state transport info
+    can be obtained via the SCTP_GET_PEER_ADDR_INFO sockopt. When it's
+    disabled, no SCTP_PEER_ADDR_CHANGE event will be sent, and it
+    returns -EACCES when trying to obtain a SCTP_PF-state transport
+    info via the SCTP_GET_PEER_ADDR_INFO sockopt.
+
+    0: Unset pf state exposure, compatible with old applications.
+
+    1: Disable pf state exposure.
+
+    2: Enable pf state exposure.
+
+    Default: 0
+
+addip_noauth_enable - BOOLEAN
+    Dynamic Address Reconfiguration (ADD-IP) requires the use of
+    authentication to protect the operations of adding or removing new
+    addresses. This requirement is mandated so that unauthorized hosts
+    would not be able to hijack associations. However, older
+    implementations may not have implemented this requirement while
+    allowing the ADD-IP extension. For reasons of interoperability,
+    we provide this variable to control the enforcement of the
+    authentication requirement.
+
+    == ===============================================================
+    1  Allow ADD-IP extension to be used without authentication. This
+       should only be set in a closed environment for interoperability
+       with older implementations.
+
+    0  Enforce the authentication requirement
+    == ===============================================================
+
+    Default: 0
+
+auth_enable - BOOLEAN
+    Enable or disable the Authenticated Chunks extension. This extension
+    provides the ability to send and receive authenticated chunks and is
+    required for secure operation of the Dynamic Address Reconfiguration
+    (ADD-IP) extension.
+
+    - 1: Enable this extension.
+    - 0: Disable this extension.
+
+    Default: 0
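+
+For reference, a minimal sketch of querying a transport's state as
+mentioned under pf_expose, assuming the lksctp-tools headers are
+available; the peer address handling is a placeholder::
+
+    #include <netinet/sctp.h>
+    #include <string.h>
+    #include <sys/socket.h>
+
+    /* Return the state (e.g. SCTP_ACTIVE, SCTP_PF, SCTP_INACTIVE) of
+     * the transport identified by *peer. When pf_expose is disabled,
+     * querying a PF transport fails with errno EACCES.
+     */
+    int get_transport_state(int fd,
+                            const struct sockaddr_storage *peer,
+                            socklen_t peer_len)
+    {
+            struct sctp_paddrinfo info;
+            socklen_t len = sizeof(info);
+
+            memset(&info, 0, sizeof(info));
+            memcpy(&info.spinfo_address, peer, peer_len);
+
+            if (getsockopt(fd, IPPROTO_SCTP, SCTP_GET_PEER_ADDR_INFO,
+                           &info, &len) < 0)
+                    return -1;
+            return info.spinfo_state;
+    }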
+
+prsctp_enable - BOOLEAN
+    Enable or disable the Partial Reliability extension (RFC3758), which
+    is used to notify peers that a given DATA should no longer be expected.
+
+    - 1: Enable extension
+    - 0: Disable
+
+    Default: 1
+
+max_burst - INTEGER
+    The limit of the number of new packets that can be initially sent. It
+    controls how bursty the generated traffic can be.
+
+    Default: 4
+
+association_max_retrans - INTEGER
+    Set the maximum number of retransmissions that an association can
+    attempt before deciding that the remote end is unreachable. If this
+    value is exceeded, the association is terminated.
+
+    Default: 10
+
+max_init_retransmits - INTEGER
+    The maximum number of retransmissions of INIT and COOKIE-ECHO chunks
+    that an association will attempt before declaring the destination
+    unreachable and terminating.
+
+    Default: 8
+
+path_max_retrans - INTEGER
+    The maximum number of retransmissions that will be attempted on a given
+    path. Once this threshold is exceeded, the path is considered
+    unreachable, and new traffic will use a different path when the
+    association is multihomed.
+
+    Default: 5
+
+pf_retrans - INTEGER
+    The number of retransmissions that will be attempted on a given path
+    before traffic is redirected to an alternate transport (should one
+    exist). Note this is distinct from path_max_retrans, as a path that
+    passes the pf_retrans threshold can still be used. It is only
+    deprioritized when a transmission path is selected by the stack. This
+    setting is primarily used to enable fast failover mechanisms without
+    having to reduce path_max_retrans to a very low value. See:
+    http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+    for details. Note also that a value of pf_retrans > path_max_retrans
+    disables this feature. Since both pf_retrans and path_max_retrans can
+    be changed by a userspace application, the pf_enable variable is used
+    to disable pf state.
+
+    Default: 0
+
+ps_retrans - INTEGER
+    Primary.Switchover.Max.Retrans (PSMR), a tunable parameter from
+    section 5, "Primary Path Switchover", of RFC 7829. The primary path
+    will be changed to another active path when the path error counter on
+    the old primary path exceeds PSMR, so that "the SCTP sender is allowed
+    to continue data transmission on a new working path even when the old
+    primary destination address becomes active again". Note this feature
+    is disabled by default: 'ps_retrans' is initialized to 0xffff per
+    netns, and its value can't be set lower than 'pf_retrans' via sysctl.
+
+    Default: 0xffff
+
+rto_initial - INTEGER
+    The initial round trip timeout value in milliseconds that will be used
+    in calculating round trip times. This is the initial time interval
+    for retransmissions.
+
+    Default: 3000
+
+rto_max - INTEGER
+    The maximum value (in milliseconds) of the round trip timeout. This
+    is the largest time interval that can elapse between retransmissions.
+
+    Default: 60000
+
+rto_min - INTEGER
+    The minimum value (in milliseconds) of the round trip timeout. This
+    is the smallest time interval that can elapse between retransmissions.
+
+    Default: 1000
+
+hb_interval - INTEGER
+    The interval (in milliseconds) between HEARTBEAT chunks. These chunks
+    are sent at the specified interval on idle paths to probe the state of
+    a given path between two associations.
+
+    Default: 30000
+
+sack_timeout - INTEGER
+    The amount of time (in milliseconds) that the implementation will wait
+    to send a SACK.
+
+    Default: 200
+
+valid_cookie_life - INTEGER
+    The default lifetime of the SCTP cookie (in milliseconds). The cookie
+    is used during association establishment.
+
+    Default: 60000
+
+cookie_preserve_enable - BOOLEAN
+    Enable or disable the ability to extend the lifetime of the SCTP cookie
+    that is used during the establishment phase of an SCTP association.
+
+    - 1: Enable cookie lifetime extension.
+    - 0: Disable
+
+    Default: 1
+
+cookie_hmac_alg - STRING
+    Select the hmac algorithm used when generating the cookie value sent by
+    a listening sctp socket to a connecting client in the INIT-ACK chunk.
+    Valid values are:
+
+    * md5
+    * sha1
+    * none
+
+    The ability to assign md5 or sha1 as the selected algorithm is
+    predicated on the configuration of those algorithms at build time
+    (CONFIG_CRYPTO_MD5 and CONFIG_CRYPTO_SHA1).
+
+    Default: Dependent on configuration. MD5 if available, else SHA1 if
+    available, else none.
+
+rcvbuf_policy - INTEGER
+    Determines if the receive buffer is attributed to the socket or to
+    the association. SCTP supports the capability to create multiple
+    associations on a single socket. When using this capability, it is
+    possible that a single stalled association that's buffering a lot
+    of data may block other associations from delivering their data by
+    consuming all of the receive buffer space. To work around this,
+    the rcvbuf_policy could be set to attribute the receive buffer space
+    to each association instead of the socket. This prevents the described
+    blocking.
+
+    - 1: rcvbuf space is per association
+    - 0: rcvbuf space is per socket
+
+    Default: 0
+
+sndbuf_policy - INTEGER
+    Similar to rcvbuf_policy above, this applies to send buffer space.
+
+    - 1: Send buffer is tracked per association
+    - 0: Send buffer is tracked per socket.
+
+    Default: 0
+
+sctp_mem - vector of 3 INTEGERs: min, pressure, max
+    Number of pages allowed for queueing by all SCTP sockets.
+
+    min: Below this number of pages SCTP is not concerned about its
+    memory use. When the amount of memory allocated by SCTP exceeds
+    this number, SCTP starts to moderate memory usage.
+
+    pressure: This value was introduced to follow the format of tcp_mem.
+
+    max: Number of pages allowed for queueing by all SCTP sockets.
+
+    Default is calculated at boot time from the amount of available memory.
+
+sctp_rmem - vector of 3 INTEGERs: min, default, max
+    Only the first value ("min") is used, "default" and "max" are
+    ignored.
+
+    min: Minimal size of the receive buffer used by SCTP sockets.
+    It is guaranteed to each SCTP socket (but not association) even
+    under moderate memory pressure.
+
+    Default: 4K
+
+sctp_wmem - vector of 3 INTEGERs: min, default, max
+    Currently this tunable has no effect.
+
+addr_scope_policy - INTEGER
+    Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00
+
+    - 0 - Disable IPv4 address scoping
+    - 1 - Enable IPv4 address scoping
+    - 2 - Follow draft but allow IPv4 private addresses
+    - 3 - Follow draft but allow IPv4 link local addresses
+
+    Default: 1
+
+
+``/proc/sys/net/core/*``
+========================
+
+    Please see: Documentation/admin-guide/sysctl/net.rst for descriptions
+    of these entries.
+ + +``/proc/sys/net/unix/*`` +======================== + +max_dgram_qlen - INTEGER + The maximum length of dgram socket receive queue + + Default: 10 + diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt deleted file mode 100644 index 5cdc37c34830..000000000000 --- a/Documentation/networking/ip-sysctl.txt +++ /dev/null @@ -1,2374 +0,0 @@ -/proc/sys/net/ipv4/* Variables: - -ip_forward - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - Forward Packets between interfaces. - - This variable is special, its change resets all configuration - parameters to their default state (RFC1122 for hosts, RFC1812 - for routers) - -ip_default_ttl - INTEGER - Default value of TTL field (Time To Live) for outgoing (but not - forwarded) IP packets. Should be between 1 and 255 inclusive. - Default: 64 (as recommended by RFC1700) - -ip_no_pmtu_disc - INTEGER - Disable Path MTU Discovery. If enabled in mode 1 and a - fragmentation-required ICMP is received, the PMTU to this - destination will be set to min_pmtu (see below). You will need - to raise min_pmtu to the smallest interface MTU on your system - manually if you want to avoid locally generated fragments. - - In mode 2 incoming Path MTU Discovery messages will be - discarded. Outgoing frames are handled the same as in mode 1, - implicitly setting IP_PMTUDISC_DONT on every created socket. - - Mode 3 is a hardened pmtu discover mode. The kernel will only - accept fragmentation-needed errors if the underlying protocol - can verify them besides a plain socket lookup. Current - protocols for which pmtu events will be honored are TCP, SCTP - and DCCP as they verify e.g. the sequence number or the - association. This mode should not be enabled globally but is - only intended to secure e.g. name servers in namespaces where - TCP path mtu must still work but path MTU information of other - protocols should be discarded. If enabled globally this mode - could break other protocols. - - Possible values: 0-3 - Default: FALSE - -min_pmtu - INTEGER - default 552 - minimum discovered Path MTU - -ip_forward_use_pmtu - BOOLEAN - By default we don't trust protocol path MTUs while forwarding - because they could be easily forged and can lead to unwanted - fragmentation by the router. - You only need to enable this if you have user-space software - which tries to discover path mtus by itself and depends on the - kernel honoring this information. This is normally not the - case. - Default: 0 (disabled) - Possible values: - 0 - disabled - 1 - enabled - -fwmark_reflect - BOOLEAN - Controls the fwmark of kernel-generated IPv4 reply packets that are not - associated with a socket for example, TCP RSTs or ICMP echo replies). - If unset, these packets have a fwmark of zero. If set, they have the - fwmark of the packet they are replying to. - Default: 0 - -fib_multipath_use_neigh - BOOLEAN - Use status of existing neighbor entry when determining nexthop for - multipath routes. If disabled, neighbor information is not used and - packets could be directed to a failed nexthop. Only valid for kernels - built with CONFIG_IP_ROUTE_MULTIPATH enabled. - Default: 0 (disabled) - Possible values: - 0 - disabled - 1 - enabled - -fib_multipath_hash_policy - INTEGER - Controls which hash policy to use for multipath routes. Only valid - for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled. 
- Default: 0 (Layer 3) - Possible values: - 0 - Layer 3 - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present - -fib_sync_mem - UNSIGNED INTEGER - Amount of dirty memory from fib entries that can be backlogged before - synchronize_rcu is forced. - Default: 512kB Minimum: 64kB Maximum: 64MB - -ip_forward_update_priority - INTEGER - Whether to update SKB priority from "TOS" field in IPv4 header after it - is forwarded. The new SKB priority is mapped from TOS field value - according to an rt_tos2priority table (see e.g. man tc-prio). - Default: 1 (Update priority.) - Possible values: - 0 - Do not update priority. - 1 - Update priority. - -route/max_size - INTEGER - Maximum number of routes allowed in the kernel. Increase - this when using large numbers of interfaces and/or routes. - From linux kernel 3.6 onwards, this is deprecated for ipv4 - as route cache is no longer used. - -neigh/default/gc_thresh1 - INTEGER - Minimum number of entries to keep. Garbage collector will not - purge entries if there are fewer than this number. - Default: 128 - -neigh/default/gc_thresh2 - INTEGER - Threshold when garbage collector becomes more aggressive about - purging entries. Entries older than 5 seconds will be cleared - when over this number. - Default: 512 - -neigh/default/gc_thresh3 - INTEGER - Maximum number of non-PERMANENT neighbor entries allowed. Increase - this when using large numbers of interfaces and when communicating - with large numbers of directly-connected peers. - Default: 1024 - -neigh/default/unres_qlen_bytes - INTEGER - The maximum number of bytes which may be used by packets - queued for each unresolved address by other network layers. - (added in linux 3.3) - Setting negative value is meaningless and will return error. - Default: SK_WMEM_MAX, (same as net.core.wmem_default). - Exact value depends on architecture and kernel options, - but should be enough to allow queuing 256 packets - of medium size. - -neigh/default/unres_qlen - INTEGER - The maximum number of packets which may be queued for each - unresolved address by other network layers. - (deprecated in linux 3.3) : use unres_qlen_bytes instead. - Prior to linux 3.3, the default value is 3 which may cause - unexpected packet loss. The current default value is calculated - according to default value of unres_qlen_bytes and true size of - packet. - Default: 101 - -mtu_expires - INTEGER - Time, in seconds, that cached PMTU information is kept. - -min_adv_mss - INTEGER - The advertised MSS depends on the first hop route MTU, but will - never be lower than this setting. - -IP Fragmentation: - -ipfrag_high_thresh - LONG INTEGER - Maximum memory used to reassemble IP fragments. - -ipfrag_low_thresh - LONG INTEGER - (Obsolete since linux-4.17) - Maximum memory used to reassemble IP fragments before the kernel - begins to remove incomplete fragment queues to free up resources. - The kernel still accepts new fragments for defragmentation. - -ipfrag_time - INTEGER - Time in seconds to keep an IP fragment in memory. - -ipfrag_max_dist - INTEGER - ipfrag_max_dist is a non-negative integer value which defines the - maximum "disorder" which is allowed among fragments which share a - common IP source address. Note that reordering of packets is - not unusual, but if a large number of fragments arrive from a source - IP address while a particular fragment queue remains incomplete, it - probably indicates that one or more fragments belonging to that queue - have been lost. 
When ipfrag_max_dist is positive, an additional check - is done on fragments before they are added to a reassembly queue - if - ipfrag_max_dist (or more) fragments have arrived from a particular IP - address between additions to any IP fragment queue using that source - address, it's presumed that one or more fragments in the queue are - lost. The existing fragment queue will be dropped, and a new one - started. An ipfrag_max_dist value of zero disables this check. - - Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can - result in unnecessarily dropping fragment queues when normal - reordering of packets occurs, which could lead to poor application - performance. Using a very large value, e.g. 50000, increases the - likelihood of incorrectly reassembling IP fragments that originate - from different IP datagrams, which could result in data corruption. - Default: 64 - -INET peer storage: - -inet_peer_threshold - INTEGER - The approximate size of the storage. Starting from this threshold - entries will be thrown aggressively. This threshold also determines - entries' time-to-live and time intervals between garbage collection - passes. More entries, less time-to-live, less GC interval. - -inet_peer_minttl - INTEGER - Minimum time-to-live of entries. Should be enough to cover fragment - time-to-live on the reassembling side. This minimum time-to-live is - guaranteed if the pool size is less than inet_peer_threshold. - Measured in seconds. - -inet_peer_maxttl - INTEGER - Maximum time-to-live of entries. Unused entries will expire after - this period of time if there is no memory pressure on the pool (i.e. - when the number of entries in the pool is very small). - Measured in seconds. - -TCP variables: - -somaxconn - INTEGER - Limit of socket listen() backlog, known in userspace as SOMAXCONN. - Defaults to 4096. (Was 128 before linux-5.4) - See also tcp_max_syn_backlog for additional tuning for TCP sockets. - -tcp_abort_on_overflow - BOOLEAN - If listening service is too slow to accept new connections, - reset them. Default state is FALSE. It means that if overflow - occurred due to a burst, connection will recover. Enable this - option _only_ if you are really sure that listening daemon - cannot be tuned to accept connections faster. Enabling this - option can harm clients of your server. - -tcp_adv_win_scale - INTEGER - Count buffering overhead as bytes/2^tcp_adv_win_scale - (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), - if it is <= 0. - Possible values are [-31, 31], inclusive. - Default: 1 - -tcp_allowed_congestion_control - STRING - Show/set the congestion control choices available to non-privileged - processes. The list is a subset of those listed in - tcp_available_congestion_control. - Default is "reno" and the default setting (tcp_congestion_control). - -tcp_app_win - INTEGER - Reserve max(window/2^tcp_app_win, mss) of window for application - buffer. Value 0 is special, it means that nothing is reserved. - Default: 31 - -tcp_autocorking - BOOLEAN - Enable TCP auto corking : - When applications do consecutive small write()/sendmsg() system calls, - we try to coalesce these small writes as much as possible, to lower - total amount of sent packets. This is done if at least one prior - packet for the flow is waiting in Qdisc queues or device transmit - queue. Applications can still use TCP_CORK for optimal behavior - when they know how/when to uncork their sockets. 
- Default : 1 - -tcp_available_congestion_control - STRING - Shows the available congestion control choices that are registered. - More congestion control algorithms may be available as modules, - but not loaded. - -tcp_base_mss - INTEGER - The initial value of search_low to be used by the packetization layer - Path MTU discovery (MTU probing). If MTU probing is enabled, - this is the initial MSS used by the connection. - -tcp_mtu_probe_floor - INTEGER - If MTU probing is enabled this caps the minimum MSS used for search_low - for the connection. - - Default : 48 - -tcp_min_snd_mss - INTEGER - TCP SYN and SYNACK messages usually advertise an ADVMSS option, - as described in RFC 1122 and RFC 6691. - If this ADVMSS option is smaller than tcp_min_snd_mss, - it is silently capped to tcp_min_snd_mss. - - Default : 48 (at least 8 bytes of payload per segment) - -tcp_congestion_control - STRING - Set the congestion control algorithm to be used for new - connections. The algorithm "reno" is always available, but - additional choices may be available based on kernel configuration. - Default is set as part of kernel configuration. - For passive connections, the listener congestion control choice - is inherited. - [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] - -tcp_dsack - BOOLEAN - Allows TCP to send "duplicate" SACKs. - -tcp_early_retrans - INTEGER - Tail loss probe (TLP) converts RTOs occurring due to tail - losses into fast recovery (draft-ietf-tcpm-rack). Note that - TLP requires RACK to function properly (see tcp_recovery below) - Possible values: - 0 disables TLP - 3 or 4 enables TLP - Default: 3 - -tcp_ecn - INTEGER - Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - Possible values are: - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - Default: 2 - -tcp_ecn_fallback - BOOLEAN - If the kernel detects that ECN connection misbehaves, enable fall - back to non-ECN. Currently, this knob implements the fallback - from RFC3168, section 6.1.1.1., but we reserve that in future, - additional detection mechanisms could be implemented under this - knob. The value is not used, if tcp_ecn or per route (or congestion - control) ECN settings are disabled. - Default: 1 (fallback enabled) - -tcp_fack - BOOLEAN - This is a legacy option, it has no effect anymore. - -tcp_fin_timeout - INTEGER - The length of time an orphaned (no longer referenced by any - application) connection will remain in the FIN_WAIT_2 state - before it is aborted at the local end. While a perfectly - valid "receive only" state for an un-orphaned connection, an - orphaned connection in FIN_WAIT_2 state could otherwise wait - forever for the remote to close its end of the connection. - Cf. tcp_max_orphans - Default: 60 seconds - -tcp_frto - INTEGER - Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. - F-RTO is an enhanced recovery algorithm for TCP retransmission - timeouts. It is particularly beneficial in networks where the - RTT fluctuates (e.g., wireless). F-RTO is sender-side only - modification. It does not require any support from the peer. 
- - By default it's enabled with a non-zero value. 0 disables F-RTO. - -tcp_fwmark_accept - BOOLEAN - If set, incoming connections to listening sockets that do not have a - socket mark will set the mark of the accepting socket to the fwmark of - the incoming SYN packet. This will cause all packets on that connection - (starting from the first SYNACK) to be sent with that fwmark. The - listening socket's mark is unchanged. Listening sockets that already - have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are - unaffected. - - Default: 0 - -tcp_invalid_ratelimit - INTEGER - Limit the maximal rate for sending duplicate acknowledgments - in response to incoming TCP packets that are for an existing - connection but that are invalid due to any of these reasons: - - (a) out-of-window sequence number, - (b) out-of-window acknowledgment number, or - (c) PAWS (Protection Against Wrapped Sequence numbers) check failure - - This can help mitigate simple "ack loop" DoS attacks, wherein - a buggy or malicious middlebox or man-in-the-middle can - rewrite TCP header fields in manner that causes each endpoint - to think that the other is sending invalid TCP segments, thus - causing each side to send an unterminating stream of duplicate - acknowledgments for invalid segments. - - Using 0 disables rate-limiting of dupacks in response to - invalid segments; otherwise this value specifies the minimal - space between sending such dupacks, in milliseconds. - - Default: 500 (milliseconds). - -tcp_keepalive_time - INTEGER - How often TCP sends out keepalive messages when keepalive is enabled. - Default: 2hours. - -tcp_keepalive_probes - INTEGER - How many keepalive probes TCP sends out, until it decides that the - connection is broken. Default value: 9. - -tcp_keepalive_intvl - INTEGER - How frequently the probes are send out. Multiplied by - tcp_keepalive_probes it is time to kill not responding connection, - after probes started. Default value: 75sec i.e. connection - will be aborted after ~11 minutes of retries. - -tcp_l3mdev_accept - BOOLEAN - Enables child sockets to inherit the L3 master device index. - Enabling this option allows a "global" listen socket to work - across L3 master domains (e.g., VRFs) with connected sockets - derived from the listen socket to be bound to the L3 domain in - which the packets originated. Only valid when the kernel was - compiled with CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) - -tcp_low_latency - BOOLEAN - This is a legacy option, it has no effect anymore. - -tcp_max_orphans - INTEGER - Maximal number of TCP sockets not attached to any user file handle, - held by system. If this number is exceeded orphaned connections are - reset immediately and warning is printed. This limit exists - only to prevent simple DoS attacks, you _must_ not rely on this - or lower the limit artificially, but rather increase it - (probably, after increasing installed memory), - if network conditions require more than default value, - and tune network services to linger and kill such states - more aggressively. Let me to remind again: each orphan eats - up to ~64K of unswappable memory. - -tcp_max_syn_backlog - INTEGER - Maximal number of remembered connection requests (SYN_RECV), - which have not received an acknowledgment from connecting client. - This is a per-listener limit. - The minimal value is 128 for low memory machines, and it will - increase in proportion to the memory of machine. - If server suffers from overload, try increasing this number. 
- Remember to also check /proc/sys/net/core/somaxconn - A SYN_RECV request socket consumes about 304 bytes of memory. - -tcp_max_tw_buckets - INTEGER - Maximal number of timewait sockets held by system simultaneously. - If this number is exceeded time-wait socket is immediately destroyed - and warning is printed. This limit exists only to prevent - simple DoS attacks, you _must_ not lower the limit artificially, - but rather increase it (probably, after increasing installed memory), - if network conditions require more than default value. - -tcp_mem - vector of 3 INTEGERs: min, pressure, max - min: below this number of pages TCP is not bothered about its - memory appetite. - - pressure: when amount of memory allocated by TCP exceeds this number - of pages, TCP moderates its memory consumption and enters memory - pressure mode, which is exited when memory consumption falls - under "min". - - max: number of pages allowed for queueing by all TCP sockets. - - Defaults are calculated at boot time from amount of available - memory. - -tcp_min_rtt_wlen - INTEGER - The window length of the windowed min filter to track the minimum RTT. - A shorter window lets a flow more quickly pick up new (higher) - minimum RTT when it is moved to a longer path (e.g., due to traffic - engineering). A longer window makes the filter more resistant to RTT - inflations such as transient congestion. The unit is seconds. - Possible values: 0 - 86400 (1 day) - Default: 300 - -tcp_moderate_rcvbuf - BOOLEAN - If set, TCP performs receive buffer auto-tuning, attempting to - automatically size the buffer (no greater than tcp_rmem[2]) to - match the size required by the path for full throughput. Enabled by - default. - -tcp_mtu_probing - INTEGER - Controls TCP Packetization-Layer Path MTU Discovery. Takes three - values: - 0 - Disabled - 1 - Disabled by default, enabled when an ICMP black hole detected - 2 - Always enabled, use initial MSS of tcp_base_mss. - -tcp_probe_interval - UNSIGNED INTEGER - Controls how often to start TCP Packetization-Layer Path MTU - Discovery reprobe. The default is reprobing every 10 minutes as - per RFC4821. - -tcp_probe_threshold - INTEGER - Controls when TCP Packetization-Layer Path MTU Discovery probing - will stop in respect to the width of search range in bytes. Default - is 8 bytes. - -tcp_no_metrics_save - BOOLEAN - By default, TCP saves various connection metrics in the route cache - when the connection closes, so that connections established in the - near future can use these to set initial conditions. Usually, this - increases overall performance, but may sometimes cause performance - degradation. If set, TCP will not cache metrics on closing - connections. - -tcp_no_ssthresh_metrics_save - BOOLEAN - Controls whether TCP saves ssthresh metrics in the route cache. - Default is 1, which disables ssthresh metrics. - -tcp_orphan_retries - INTEGER - This value influences the timeout of a locally closed TCP connection, - when RTO retransmissions remain unacknowledged. - See tcp_retries2 for more details. - - The default value is 8. - If your machine is a loaded WEB server, - you should think about lowering this value, such sockets - may consume significant resources. Cf. tcp_max_orphans. - -tcp_recovery - INTEGER - This value is a bitmap to enable various experimental loss recovery - features. - - RACK: 0x1 enables the RACK loss detection for fast detection of lost - retransmissions and tail drops. It also subsumes and disables - RFC6675 recovery for SACK connections. 
- RACK: 0x2 makes RACK's reordering window static (min_rtt/4). - RACK: 0x4 disables RACK's DUPACK threshold heuristic - - Default: 0x1 - -tcp_reordering - INTEGER - Initial reordering level of packets in a TCP stream. - TCP stack can then dynamically adjust flow reordering level - between this initial value and tcp_max_reordering - Default: 3 - -tcp_max_reordering - INTEGER - Maximal reordering level of packets in a TCP stream. - 300 is a fairly conservative value, but you might increase it - if paths are using per packet load balancing (like bonding rr mode) - Default: 300 - -tcp_retrans_collapse - BOOLEAN - Bug-to-bug compatibility with some broken printers. - On retransmit try to send bigger packets to work around bugs in - certain TCP stacks. - -tcp_retries1 - INTEGER - This value influences the time, after which TCP decides, that - something is wrong due to unacknowledged RTO retransmissions, - and reports this suspicion to the network layer. - See tcp_retries2 for more details. - - RFC 1122 recommends at least 3 retransmissions, which is the - default. - -tcp_retries2 - INTEGER - This value influences the timeout of an alive TCP connection, - when RTO retransmissions remain unacknowledged. - Given a value of N, a hypothetical TCP connection following - exponential backoff with an initial RTO of TCP_RTO_MIN would - retransmit N times before killing the connection at the (N+1)th RTO. - - The default value of 15 yields a hypothetical timeout of 924.6 - seconds and is a lower bound for the effective timeout. - TCP will effectively time out at the first RTO which exceeds the - hypothetical timeout. - - RFC 1122 recommends at least 100 seconds for the timeout, - which corresponds to a value of at least 8. - -tcp_rfc1337 - BOOLEAN - If set, the TCP stack behaves conforming to RFC1337. If unset, - we are not conforming to RFC, but prevent TCP TIME_WAIT - assassination. - Default: 0 - -tcp_rmem - vector of 3 INTEGERs: min, default, max - min: Minimal size of receive buffer used by TCP sockets. - It is guaranteed to each TCP socket, even under moderate memory - pressure. - Default: 4K - - default: initial size of receive buffer used by TCP sockets. - This value overrides net.core.rmem_default used by other protocols. - Default: 87380 bytes. This value results in window of 65535 with - default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit - less for default tcp_app_win. See below about these variables. - - max: maximal size of receive buffer allowed for automatically - selected receiver buffers for TCP socket. This value does not override - net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables - automatic tuning of that socket's receive buffer size, in which - case this value is ignored. - Default: between 87380B and 6MB, depending on RAM size. - -tcp_sack - BOOLEAN - Enable select acknowledgments (SACKS). - -tcp_comp_sack_delay_ns - LONG INTEGER - TCP tries to reduce number of SACK sent, using a timer - based on 5% of SRTT, capped by this sysctl, in nano seconds. - The default is 1ms, based on TSO autosizing period. - - Default : 1,000,000 ns (1 ms) - -tcp_comp_sack_nr - INTEGER - Max number of SACK that can be compressed. - Using 0 disables SACK compression. - - Default : 44 - -tcp_slow_start_after_idle - BOOLEAN - If set, provide RFC2861 behavior and time out the congestion - window after an idle period. An idle period is defined at - the current RTO. If unset, the congestion window will not - be timed out after an idle period. 
- Default: 1 - -tcp_stdurg - BOOLEAN - Use the Host requirements interpretation of the TCP urgent pointer field. - Most hosts use the older BSD interpretation, so if you turn this on - Linux might not communicate correctly with them. - Default: FALSE - -tcp_synack_retries - INTEGER - Number of times SYNACKs for a passive TCP connection attempt will - be retransmitted. Should not be higher than 255. Default value - is 5, which corresponds to 31seconds till the last retransmission - with the current initial RTO of 1second. With this the final timeout - for a passive TCP connection will happen after 63seconds. - -tcp_syncookies - INTEGER - Only valid when the kernel was compiled with CONFIG_SYN_COOKIES - Send out syncookies when the syn backlog queue of a socket - overflows. This is to prevent against the common 'SYN flood attack' - Default: 1 - - Note, that syncookies is fallback facility. - It MUST NOT be used to help highly loaded servers to stand - against legal connection rate. If you see SYN flood warnings - in your logs, but investigation shows that they occur - because of overload with legal connections, you should tune - another parameters until this warning disappear. - See: tcp_max_syn_backlog, tcp_synack_retries, tcp_abort_on_overflow. - - syncookies seriously violate TCP protocol, do not allow - to use TCP extensions, can result in serious degradation - of some services (f.e. SMTP relaying), visible not by you, - but your clients and relays, contacting you. While you see - SYN flood warnings in logs not being really flooded, your server - is seriously misconfigured. - - If you want to test which effects syncookies have to your - network connections you can set this knob to 2 to enable - unconditionally generation of syncookies. - -tcp_fastopen - INTEGER - Enable TCP Fast Open (RFC7413) to send and accept data in the opening - SYN packet. - - The client support is enabled by flag 0x1 (on by default). The client - then must use sendmsg() or sendto() with the MSG_FASTOPEN flag, - rather than connect() to send data in SYN. - - The server support is enabled by flag 0x2 (off by default). Then - either enable for all listeners with another flag (0x400) or - enable individual listeners via TCP_FASTOPEN socket option with - the option value being the length of the syn-data backlog. - - The values (bitmap) are - 0x1: (client) enables sending data in the opening SYN on the client. - 0x2: (server) enables the server support, i.e., allowing data in - a SYN packet to be accepted and passed to the - application before 3-way handshake finishes. - 0x4: (client) send data in the opening SYN regardless of cookie - availability and without a cookie option. - 0x200: (server) accept data-in-SYN w/o any cookie option present. - 0x400: (server) enable all listeners to support Fast Open by - default without explicit TCP_FASTOPEN socket option. - - Default: 0x1 - - Note that that additional client or server features are only - effective if the basic support (0x1 and 0x2) are enabled respectively. - -tcp_fastopen_blackhole_timeout_sec - INTEGER - Initial time period in second to disable Fastopen on active TCP sockets - when a TFO firewall blackhole issue happens. - This time period will grow exponentially when more blackhole issues - get detected right after Fastopen is re-enabled and will reset to - initial value when the blackhole issue goes away. - 0 to disable the blackhole detection. - By default, it is set to 1hr. 
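To make the tcp_fastopen bitmap above concrete, a minimal sketch of enabling
both the client (0x1) and server (0x2) support at runtime (paths as above;
note that a listener must still set the TCP_FASTOPEN socket option unless
0x400 is also set):

	$ sysctl -w net.ipv4.tcp_fastopen=3
	$ cat /proc/sys/net/ipv4/tcp_fastopen
	3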
-
-tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
-	The list consists of a primary key and an optional backup key. The
-	primary key is used for both creating and validating cookies, while the
-	optional backup key is only used for validating cookies. The purpose of
-	the backup key is to maximize TFO validation when keys are rotated.
-
-	A randomly chosen primary key may be configured by the kernel if
-	the tcp_fastopen sysctl is set to 0x400 (see above), or if the
-	TCP_FASTOPEN setsockopt() optname is set and a key has not been
-	previously configured via sysctl. If keys are configured via
-	setsockopt() by using the TCP_FASTOPEN_KEY optname, then those
-	per-socket keys will be used instead of any keys that are specified via
-	sysctl.
-
-	A key is specified as 4 8-digit hexadecimal integers which are separated
-	by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be
-	omitted. A primary and a backup key may be specified by separating them
-	by a comma. If only one key is specified, it becomes the primary key and
-	any previously configured backup keys are removed.
-
-tcp_syn_retries - INTEGER
-	Number of times initial SYNs for an active TCP connection attempt
-	will be retransmitted. Should not be higher than 127. Default value
-	is 6, which corresponds to 63 seconds until the last retransmission
-	with the current initial RTO of 1 second. With this the final timeout
-	for an active TCP connection attempt will happen after 127 seconds.
-
-tcp_timestamps - INTEGER
-	Enable timestamps as defined in RFC1323.
-	0: Disabled.
-	1: Enable timestamps as defined in RFC1323 and use random offset for
-	each connection rather than only using the current time.
-	2: Like 1, but without random offsets.
-	Default: 1
-
-tcp_min_tso_segs - INTEGER
-	Minimal number of segments per TSO frame.
-	Since linux-3.12, TCP does an automatic sizing of TSO frames,
-	depending on flow rate, instead of filling 64KB packets.
-	For specific usages, it's possible to force TCP to build big
-	TSO frames. Note that the TCP stack might split overly large TSO
-	packets if the available window is too small.
-	Default: 2
-
-tcp_pacing_ss_ratio - INTEGER
-	sk->sk_pacing_rate is set by the TCP stack using a ratio applied
-	to the current rate. (current_rate = cwnd * mss / srtt)
-	If TCP is in slow start, tcp_pacing_ss_ratio is applied
-	to let TCP probe for bigger speeds, assuming cwnd can be
-	doubled every other RTT.
-	Default: 200
-
-tcp_pacing_ca_ratio - INTEGER
-	sk->sk_pacing_rate is set by the TCP stack using a ratio applied
-	to the current rate. (current_rate = cwnd * mss / srtt)
-	If TCP is in the congestion avoidance phase, tcp_pacing_ca_ratio
-	is applied to conservatively probe for bigger throughput.
-	Default: 120
-
-tcp_tso_win_divisor - INTEGER
-	This allows control over what percentage of the congestion window
-	can be consumed by a single TSO frame.
-	The setting of this parameter is a choice between burstiness and
-	building larger TSO frames.
-	Default: 3
-
-tcp_tw_reuse - INTEGER
-	Enable reuse of TIME-WAIT sockets for new connections when it is
-	safe from a protocol viewpoint.
-	0 - disable
-	1 - global enable
-	2 - enable for loopback traffic only
-	It should not be changed without advice/request of technical
-	experts.
-	Default: 2
-
-tcp_window_scaling - BOOLEAN
-	Enable window scaling as defined in RFC1323.
-
-tcp_wmem - vector of 3 INTEGERs: min, default, max
-	min: Amount of memory reserved for send buffers for TCP sockets.
-	Each TCP socket is entitled to use it from the moment it is created.
- Default: 4K - - default: initial size of send buffer used by TCP sockets. This - value overrides net.core.wmem_default used by other protocols. - It is usually lower than net.core.wmem_default. - Default: 16K - - max: Maximal amount of memory allowed for automatically tuned - send buffers for TCP sockets. This value does not override - net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables - automatic tuning of that socket's send buffer size, in which case - this value is ignored. - Default: between 64K and 4MB, depending on RAM size. - -tcp_notsent_lowat - UNSIGNED INTEGER - A TCP socket can control the amount of unsent bytes in its write queue, - thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() - reports POLLOUT events if the amount of unsent bytes is below a per - socket value, and if the write queue is not full. sendmsg() will - also not add new buffers if the limit is hit. - - This global variable controls the amount of unsent data for - sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change - to the global variable has immediate effect. - - Default: UINT_MAX (0xFFFFFFFF) - -tcp_workaround_signed_windows - BOOLEAN - If set, assume no receipt of a window scaling option means the - remote TCP is broken and treats the window as a signed quantity. - If unset, assume the remote TCP is not broken even if we do - not receive a window scaling option from them. - Default: 0 - -tcp_thin_linear_timeouts - BOOLEAN - Enable dynamic triggering of linear timeouts for thin streams. - If set, a check is performed upon retransmission by timeout to - determine if the stream is thin (less than 4 packets in flight). - As long as the stream is found to be thin, up to 6 linear - timeouts may be performed before exponential backoff mode is - initiated. This improves retransmission latency for - non-aggressive thin streams, often found to be time-dependent. - For more information on thin streams, see - Documentation/networking/tcp-thin.txt - Default: 0 - -tcp_limit_output_bytes - INTEGER - Controls TCP Small Queue limit per tcp socket. - TCP bulk sender tends to increase packets in flight until it - gets losses notifications. With SNDBUF autotuning, this can - result in a large amount of packets queued on the local machine - (e.g.: qdiscs, CPU backlog, or device) hurting latency of other - flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes - limits the number of bytes on qdisc or device to reduce artificial - RTT/cwnd and reduce bufferbloat. - Default: 1048576 (16 * 65536) - -tcp_challenge_ack_limit - INTEGER - Limits number of Challenge ACK sent per second, as recommended - in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) - Default: 1000 - -tcp_rx_skb_cache - BOOLEAN - Controls a per TCP socket cache of one skb, that might help - performance of some workloads. This might be dangerous - on systems with a lot of TCP sockets, since it increases - memory usage. - - Default: 0 (disabled) - -UDP variables: - -udp_l3mdev_accept - BOOLEAN - Enabling this option allows a "global" bound socket to work - across L3 master domains (e.g., VRFs) with packets capable of - being received regardless of the L3 domain in which they - originated. Only valid when the kernel was compiled with - CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) - -udp_mem - vector of 3 INTEGERs: min, pressure, max - Number of pages allowed for queueing by all UDP sockets. - - min: Below this number of pages UDP is not bothered about its - memory appetite. 
When the amount of memory allocated by UDP exceeds
-	this number, UDP starts to moderate memory usage.
-
-	pressure: This value was introduced to follow the format of tcp_mem.
-
-	max: Number of pages allowed for queueing by all UDP sockets.
-
-	Default is calculated at boot time from the amount of available memory.
-
-udp_rmem_min - INTEGER
-	Minimal size of the receive buffer used by UDP sockets in moderation.
-	Each UDP socket is able to use this size for receiving data, even if
-	the total pages of UDP sockets exceed udp_mem pressure. The unit is
-	bytes.
-	Default: 4K
-
-udp_wmem_min - INTEGER
-	Minimal size of the send buffer used by UDP sockets in moderation.
-	Each UDP socket is able to use this size for sending data, even if
-	the total pages of UDP sockets exceed udp_mem pressure. The unit is
-	bytes.
-	Default: 4K
-
-RAW variables:
-
-raw_l3mdev_accept - BOOLEAN
-	Enabling this option allows a "global" bound socket to work
-	across L3 master domains (e.g., VRFs) with packets capable of
-	being received regardless of the L3 domain in which they
-	originated. Only valid when the kernel was compiled with
-	CONFIG_NET_L3_MASTER_DEV.
-	Default: 1 (enabled)
-
-CIPSOv4 Variables:
-
-cipso_cache_enable - BOOLEAN
-	If set, enable additions to and lookups from the CIPSO label mapping
-	cache. If unset, additions are ignored and lookups always result in a
-	miss. However, regardless of the setting, the cache is still
-	invalidated when required, which means you can safely toggle this on
-	and off; the cache will always be "safe".
-	Default: 1
-
-cipso_cache_bucket_size - INTEGER
-	The CIPSO label cache consists of a fixed size hash table with each
-	hash bucket containing a number of cache entries. This variable limits
-	the number of entries in each hash bucket; the larger the value the
-	more CIPSO label mappings that can be cached. When the number of
-	entries in a given hash bucket reaches this limit adding new entries
-	causes the oldest entry in the bucket to be removed to make room.
-	Default: 10
-
-cipso_rbm_optfmt - BOOLEAN
-	Enable the "Optimized Tag 1 Format" as defined in section 3.4.2.6 of
-	the CIPSO draft specification (see Documentation/netlabel for details).
-	This means that when set, the CIPSO tag will be padded with empty
-	categories in order to make the packet data 32-bit aligned.
-	Default: 0
-
-cipso_rbm_structvalid - BOOLEAN
-	If set, do a very strict check of the CIPSO option when
-	ip_options_compile() is called. If unset, relax the checks done during
-	ip_options_compile(). Either way is "safe", as errors are caught
-	elsewhere in the CIPSO processing code, but setting this to 0 (False)
-	should result in less work (i.e. it should be faster) but could cause
-	problems with other implementations that require strict checking.
-	Default: 0
-
-IP Variables:
-
-ip_local_port_range - 2 INTEGERS
-	Defines the local port range that is used by TCP and UDP to
-	choose the local port. The first number is the first local port
-	number, the second is the last.
-	If possible, it is better if these numbers have different parity
-	(one even and one odd value).
-	Must be greater than or equal to ip_unprivileged_port_start.
-	The default values are 32768 and 60999 respectively.
-
-ip_local_reserved_ports - list of comma separated ranges
-	Specify the ports which are reserved for known third-party
-	applications. These ports will not be used by automatic port
-	assignments (e.g. when calling connect() or bind() with port
-	number 0). Explicit port allocation behavior is unchanged.
- - The format used for both input and output is a comma separated - list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and - 10). Writing to the file will clear all previously reserved - ports and update the current list with the one given in the - input. - - Note that ip_local_port_range and ip_local_reserved_ports - settings are independent and both are considered by the kernel - when determining which ports are available for automatic port - assignments. - - You can reserve ports which are not in the current - ip_local_port_range, e.g.: - - $ cat /proc/sys/net/ipv4/ip_local_port_range - 32000 60999 - $ cat /proc/sys/net/ipv4/ip_local_reserved_ports - 8080,9148 - - although this is redundant. However such a setting is useful - if later the port range is changed to a value that will - include the reserved ports. - - Default: Empty - -ip_unprivileged_port_start - INTEGER - This is a per-namespace sysctl. It defines the first - unprivileged port in the network namespace. Privileged ports - require root or CAP_NET_BIND_SERVICE in order to bind to them. - To disable all privileged ports, set this to 0. They must not - overlap with the ip_local_port_range. - - Default: 1024 - -ip_nonlocal_bind - BOOLEAN - If set, allows processes to bind() to non-local IP addresses, - which can be quite useful - but may break some applications. - Default: 0 - -ip_autobind_reuse - BOOLEAN - By default, bind() does not select the ports automatically even if - the new socket and all sockets bound to the port have SO_REUSEADDR. - ip_autobind_reuse allows bind() to reuse the port and this is useful - when you use bind()+connect(), but may break some applications. - The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this - option should only be set by experts. - Default: 0 - -ip_dynaddr - BOOLEAN - If set non-zero, enables support for dynamic addresses. - If set to a non-zero value larger than 1, a kernel log - message will be printed when dynamic address rewriting - occurs. - Default: 0 - -ip_early_demux - BOOLEAN - Optimize input packet processing down to one demux for - certain kinds of local sockets. Currently we only do this - for established TCP and connected UDP sockets. - - It may add an additional cost for pure routing workloads that - reduces overall throughput, in such case you should disable it. - Default: 1 - -ping_group_range - 2 INTEGERS - Restrict ICMP_PROTO datagram sockets to users in the group range. - The default is "1 0", meaning, that nobody (not even root) may - create ping sockets. Setting it to "100 100" would grant permissions - to the single group. "0 4294967295" would enable it for the world, "100 - 4294967295" would enable it for the users, but not daemons. - -tcp_early_demux - BOOLEAN - Enable early demux for established TCP sockets. - Default: 1 - -udp_early_demux - BOOLEAN - Enable early demux for connected UDP sockets. Disable this if - your system could experience more unconnected load. - Default: 1 - -icmp_echo_ignore_all - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it. - Default: 0 - -icmp_echo_ignore_broadcasts - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO and - TIMESTAMP requests sent to it via broadcast/multicast. - Default: 1 - -icmp_ratelimit - INTEGER - Limit the maximal rates for sending ICMP packets whose type matches - icmp_ratemask (see below) to specific targets. - 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. 
-	Note that another sysctl, icmp_msgs_per_sec, limits the number
-	of ICMP packets sent across all targets.
-	Default: 1000
-
-icmp_msgs_per_sec - INTEGER
-	Limit the maximal number of ICMP packets sent per second from this
-	host. Only messages whose type matches icmp_ratemask (see below) are
-	controlled by this limit.
-	Default: 1000
-
-icmp_msgs_burst - INTEGER
-	icmp_msgs_per_sec controls the number of ICMP packets sent per second,
-	while icmp_msgs_burst controls the burst size of these packets.
-	Default: 50
-
-icmp_ratemask - INTEGER
-	Mask made of ICMP types for which rates are being limited.
-	Significant bits: IHGFEDCBA9876543210
-	Default mask: 0000001100000011000 (6168)
-
-	Bit definitions (see include/linux/icmp.h):
-		0 Echo Reply
-		3 Destination Unreachable *
-		4 Source Quench *
-		5 Redirect
-		8 Echo Request
-		B Time Exceeded *
-		C Parameter Problem *
-		D Timestamp Request
-		E Timestamp Reply
-		F Info Request
-		G Info Reply
-		H Address Mask Request
-		I Address Mask Reply
-
-	* These are rate limited by default (see default mask above)
-
-icmp_ignore_bogus_error_responses - BOOLEAN
-	Some routers violate RFC1122 by sending bogus responses to broadcast
-	frames. Such violations are normally logged via a kernel warning.
-	If this is set to TRUE, the kernel will not give such warnings, which
-	will avoid log file clutter.
-	Default: 1
-
-icmp_errors_use_inbound_ifaddr - BOOLEAN
-
-	If zero, icmp error messages are sent with the primary address of
-	the exiting interface.
-
-	If non-zero, the message will be sent with the primary address of
-	the interface that received the packet that caused the icmp error.
-	This is the behaviour many network administrators will expect from
-	a router, and it can make debugging complicated network layouts
-	much easier.
-
-	Note that if no primary address exists for the interface selected,
-	then the primary address of the first non-loopback interface that
-	has one will be used regardless of this setting.
-
-	Default: 0
-
-igmp_max_memberships - INTEGER
-	Change the maximum number of multicast groups we can subscribe to.
-	Default: 20
-
-	Theoretical maximum value is bounded by having to send a membership
-	report in a single datagram (i.e. the report can't span multiple
-	datagrams, or risk confusing the switch and leaving groups you don't
-	intend to).
-
-	The number of supported groups 'M' is bounded by the number of group
-	report entries you can fit into a single datagram of 65535 bytes:
-
-	M = (65536 - sizeof(ip header)) / sizeof(Group record)
-
-	Group records are variable length, with a minimum of 12 bytes.
-	So net.ipv4.igmp_max_memberships should not be set higher than:
-
-	(65536-24) / 12 = 5459
-
-	The value 5459 assumes no IP header options, so in practice
-	this number may be lower.
-
-igmp_max_msf - INTEGER
-	Maximum number of addresses allowed in the source filter list for a
-	multicast group.
-	Default: 10
-
-igmp_qrv - INTEGER
-	Controls the IGMP query robustness variable (see RFC2236 8.1).
-	Default: 2 (as specified by RFC2236 8.1)
-	Minimum: 1 (as specified by RFC6636 4.5)
-
-force_igmp_version - INTEGER
-	0 - (default) No enforcement of an IGMP version; IGMPv1/v2 fallback
-	    is allowed. The host falls back to IGMPv3 mode again when all
-	    IGMPv1/v2 Querier Present timers expire.
-	1 - Enforce use of IGMP version 1. Also replies with an IGMPv1
-	    report if an IGMPv2/v3 query is received.
-	2 - Enforce use of IGMP version 2. Falls back to IGMPv1 if an
-	    IGMPv1 query message is received. Replies with a report if an
-	    IGMPv3 query is received.
-	3 - Enforce use of IGMP version 3.
-	    This reacts the same as the default value 0.
-
-	Note: this is not the same as force_mld_version, because the Security
-	Considerations of IGMPv3 (RFC3376), unlike those of MLDv2 (RFC3810),
-	do not clearly state that messages of other versions may be ignored
-	completely. Keeping the default value of 0 is therefore recommended.
-
-conf/interface/* changes special settings per interface (where
-"interface" is the name of your network interface)
-
-conf/all/* is special; it changes the settings for all interfaces
-
-log_martians - BOOLEAN
-	Log packets with impossible addresses to kernel log.
-	log_martians for the interface will be enabled if at least one of
-	conf/{all,interface}/log_martians is set to TRUE,
-	it will be disabled otherwise
-
-accept_redirects - BOOLEAN
-	Accept ICMP redirect messages.
-	accept_redirects for the interface will be enabled if:
-	- both conf/{all,interface}/accept_redirects are TRUE in the case
-	  forwarding for the interface is enabled
-	or
-	- at least one of conf/{all,interface}/accept_redirects is TRUE in the
-	  case forwarding for the interface is disabled
-	accept_redirects for the interface will be disabled otherwise
-	default TRUE (host)
-		FALSE (router)
-
-forwarding - BOOLEAN
-	Enable IP forwarding on this interface. This controls whether packets
-	received _on_ this interface can be forwarded.
-
-mc_forwarding - BOOLEAN
-	Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
-	and a multicast routing daemon is required.
-	conf/all/mc_forwarding must also be set to TRUE to enable multicast
-	routing for the interface
-
-medium_id - INTEGER
-	Integer value used to differentiate the devices by the medium they
-	are attached to. Two devices can have different id values when
-	the broadcast packets are received only on one of them.
-	The default value 0 means that the device is the only interface
-	to its medium, value of -1 means that medium is not known.
-
-	Currently, it is used to change the proxy_arp behavior:
-	the proxy_arp feature is enabled for packets forwarded between
-	two devices attached to different media.
-
-proxy_arp - BOOLEAN
-	Do proxy arp.
-	proxy_arp for the interface will be enabled if at least one of
-	conf/{all,interface}/proxy_arp is set to TRUE,
-	it will be disabled otherwise
-
-proxy_arp_pvlan - BOOLEAN
-	Private VLAN proxy arp.
-	Basically allow proxy arp replies back to the same interface
-	(from which the ARP request/solicitation was received).
-
-	This is done to support (ethernet) switch features, like RFC
-	3069, where the individual ports are NOT allowed to
-	communicate with each other, but they are allowed to talk to
-	the upstream router. As described in RFC 3069, it is possible
-	to allow these hosts to communicate through the upstream
-	router by proxy_arp'ing. It does not need to be used together
-	with proxy_arp.
-
-	This technology is known by different names:
-	  In RFC 3069 it is called VLAN Aggregation.
-	  Cisco and Allied Telesyn call it Private VLAN.
-	  Hewlett-Packard calls it Source-Port filtering or port-isolation.
-	  Ericsson calls it MAC-Forced Forwarding (RFC Draft).
-
-shared_media - BOOLEAN
-	Send (router) or accept (host) RFC1620 shared media redirects.
-	Overrides secure_redirects.
-	shared_media for the interface will be enabled if at least one of
-	conf/{all,interface}/shared_media is set to TRUE,
-	it will be disabled otherwise
-	default TRUE
-
-secure_redirects - BOOLEAN
-	Accept ICMP redirect messages only to gateways listed in the
-	interface's current gateway list. Even if disabled, RFC1122 redirect
-	rules still apply.
- Overridden by shared_media. - secure_redirects for the interface will be enabled if at least one of - conf/{all,interface}/secure_redirects is set to TRUE, - it will be disabled otherwise - default TRUE - -send_redirects - BOOLEAN - Send redirects, if router. - send_redirects for the interface will be enabled if at least one of - conf/{all,interface}/send_redirects is set to TRUE, - it will be disabled otherwise - Default: TRUE - -bootp_relay - BOOLEAN - Accept packets with source address 0.b.c.d destined - not to this host as local ones. It is supposed, that - BOOTP relay daemon will catch and forward such packets. - conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay - for the interface - default FALSE - Not Implemented Yet. - -accept_source_route - BOOLEAN - Accept packets with SRR option. - conf/all/accept_source_route must also be set to TRUE to accept packets - with SRR option on the interface - default TRUE (router) - FALSE (host) - -accept_local - BOOLEAN - Accept packets with local source addresses. In combination with - suitable routing, this can be used to direct packets between two - local interfaces over the wire and have them accepted properly. - default FALSE - -route_localnet - BOOLEAN - Do not consider loopback addresses as martian source or destination - while routing. This enables the use of 127/8 for local routing purposes. - default FALSE - -rp_filter - INTEGER - 0 - No source validation. - 1 - Strict mode as defined in RFC3704 Strict Reverse Path - Each incoming packet is tested against the FIB and if the interface - is not the best reverse path the packet check will fail. - By default failed packets are discarded. - 2 - Loose mode as defined in RFC3704 Loose Reverse Path - Each incoming packet's source address is also tested against the FIB - and if the source address is not reachable via any interface - the packet check will fail. - - Current recommended practice in RFC3704 is to enable strict mode - to prevent IP spoofing from DDos attacks. If using asymmetric routing - or other complicated routing, then loose mode is recommended. - - The max value from conf/{all,interface}/rp_filter is used - when doing source validation on the {interface}. - - Default value is 0. Note that some distributions enable it - in startup scripts. - -arp_filter - BOOLEAN - 1 - Allows you to have multiple network interfaces on the same - subnet, and have the ARPs for each interface be answered - based on whether or not the kernel would route a packet from - the ARP'd IP out that interface (therefore you must use source - based routing for this to work). In other words it allows control - of which cards (usually 1) will respond to an arp request. - - 0 - (default) The kernel can respond to arp requests with addresses - from other interfaces. This may seem wrong but it usually makes - sense, because it increases the chance of successful communication. - IP addresses are owned by the complete host on Linux, not by - particular interfaces. Only for more complex setups like load- - balancing, does this behaviour cause problems. 
- - arp_filter for the interface will be enabled if at least one of - conf/{all,interface}/arp_filter is set to TRUE, - it will be disabled otherwise - -arp_announce - INTEGER - Define different restriction levels for announcing the local - source IP address from IP packets in ARP requests sent on - interface: - 0 - (default) Use any local address, configured on any interface - 1 - Try to avoid local addresses that are not in the target's - subnet for this interface. This mode is useful when target - hosts reachable via this interface require the source IP - address in ARP requests to be part of their logical network - configured on the receiving interface. When we generate the - request we will check all our subnets that include the - target IP and will preserve the source address if it is from - such subnet. If there is no such subnet we select source - address according to the rules for level 2. - 2 - Always use the best local address for this target. - In this mode we ignore the source address in the IP packet - and try to select local address that we prefer for talks with - the target host. Such local address is selected by looking - for primary IP addresses on all our subnets on the outgoing - interface that include the target IP address. If no suitable - local address is found we select the first local address - we have on the outgoing interface or on all other interfaces, - with the hope we will receive reply for our request and - even sometimes no matter the source IP address we announce. - - The max value from conf/{all,interface}/arp_announce is used. - - Increasing the restriction level gives more chance for - receiving answer from the resolved target while decreasing - the level announces more valid sender's information. - -arp_ignore - INTEGER - Define different modes for sending replies in response to - received ARP requests that resolve local target IP addresses: - 0 - (default): reply for any local target IP address, configured - on any interface - 1 - reply only if the target IP address is local address - configured on the incoming interface - 2 - reply only if the target IP address is local address - configured on the incoming interface and both with the - sender's IP address are part from same subnet on this interface - 3 - do not reply for local addresses configured with scope host, - only resolutions for global and link addresses are replied - 4-7 - reserved - 8 - do not reply for all local addresses - - The max value from conf/{all,interface}/arp_ignore is used - when ARP request is received on the {interface} - -arp_notify - BOOLEAN - Define mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate gratuitous arp requests when device is brought up - or hardware address changes. - -arp_accept - BOOLEAN - Define behavior for gratuitous ARP frames who's IP is not - already present in the ARP table: - 0 - don't create new entries in the ARP table - 1 - create new entries in the ARP table - - Both replies and requests type gratuitous arp will trigger the - ARP table to be updated, if this setting is on. - - If the ARP table already contains the IP address of the - gratuitous arp frame, the arp table will be updated regardless - if this setting is on or off. - -mcast_solicit - INTEGER - The maximum number of multicast probes in INCOMPLETE state, - when the associated hardware address is unknown. Defaults - to 3. 
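As a sketch of how the arp_ignore and arp_announce levels above are commonly
combined on a multi-homed host (the interface name eth0 is illustrative):

	$ sysctl -w net.ipv4.conf.all.arp_ignore=1	# answer only on the owning interface
	$ sysctl -w net.ipv4.conf.all.arp_announce=2	# announce the best local source

Since the max of the conf/{all,interface} values is used for both knobs,
setting conf/all is sufficient here.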
- -ucast_solicit - INTEGER - The maximum number of unicast probes in PROBE state, when - the hardware address is being reconfirmed. Defaults to 3. - -app_solicit - INTEGER - The maximum number of probes to send to the user space ARP daemon - via netlink before dropping back to multicast probes (see - mcast_resolicit). Defaults to 0. - -mcast_resolicit - INTEGER - The maximum number of multicast probes after unicast and - app probes in PROBE state. Defaults to 0. - -disable_policy - BOOLEAN - Disable IPSEC policy (SPD) for this interface - -disable_xfrm - BOOLEAN - Disable IPSEC encryption on this interface, whatever the policy - -igmpv2_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - IGMPv1 or IGMPv2 report retransmit will take place. - Default: 10000 (10 seconds) - -igmpv3_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - IGMPv3 report retransmit will take place. - Default: 1000 (1 seconds) - -promote_secondaries - BOOLEAN - When a primary IP address is removed from this interface - promote a corresponding secondary IP address instead of - removing all the corresponding secondary IP addresses. - -drop_unicast_in_l2_multicast - BOOLEAN - Drop any unicast IP packets that are received in link-layer - multicast (or broadcast) frames. - This behavior (for multicast) is actually a SHOULD in RFC - 1122, but is disabled by default for compatibility reasons. - Default: off (0) - -drop_gratuitous_arp - BOOLEAN - Drop all gratuitous ARP frames, for example if there's a known - good ARP proxy on the network and such frames need not be used - (or in the case of 802.11, must not be used to prevent attacks.) - Default: off (0) - - -tag - INTEGER - Allows you to write a number, which can be used as required. - Default value is 0. - -xfrm4_gc_thresh - INTEGER - (Obsolete since linux-4.14) - The threshold at which we will start garbage collecting for IPv4 - destination cache entries. At twice this value the system will - refuse new allocations. - -igmp_link_local_mcast_reports - BOOLEAN - Enable IGMP reports for link local multicast groups in the - 224.0.0.X range. - Default TRUE - -Alexey Kuznetsov. -kuznet@ms2.inr.ac.ru - -Updated by: -Andi Kleen -ak@muc.de -Nicolas Delon -delon.nicolas@wanadoo.fr - - - - -/proc/sys/net/ipv6/* Variables: - -IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also -apply to IPv6 [XXX?]. - -bindv6only - BOOLEAN - Default value for IPV6_V6ONLY socket option, - which restricts use of the IPv6 socket to IPv6 communication - only. - TRUE: disable IPv4-mapped address feature - FALSE: enable IPv4-mapped address feature - - Default: FALSE (as specified in RFC3493) - -flowlabel_consistency - BOOLEAN - Protect the consistency (and unicity) of flow label. - You have to disable it to use IPV6_FL_F_REFLECT flag on the - flow label manager. - TRUE: enabled - FALSE: disabled - Default: TRUE - -auto_flowlabels - INTEGER - Automatically generate flow labels based on a flow hash of the - packet. This allows intermediate devices, such as routers, to - identify packet flows for mechanisms like Equal Cost Multipath - Routing (see RFC 6438). 
- 0: automatic flow labels are completely disabled - 1: automatic flow labels are enabled by default, they can be - disabled on a per socket basis using the IPV6_AUTOFLOWLABEL - socket option - 2: automatic flow labels are allowed, they may be enabled on a - per socket basis using the IPV6_AUTOFLOWLABEL socket option - 3: automatic flow labels are enabled and enforced, they cannot - be disabled by the socket option - Default: 1 - -flowlabel_state_ranges - BOOLEAN - Split the flow label number space into two ranges. 0-0x7FFFF is - reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF - is reserved for stateless flow labels as described in RFC6437. - TRUE: enabled - FALSE: disabled - Default: true - -flowlabel_reflect - INTEGER - Control flow label reflection. Needed for Path MTU - Discovery to work with Equal Cost Multipath Routing in anycast - environments. See RFC 7690 and: - https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 - - This is a bitmask. - 1: enabled for established flows - - Note that this prevents automatic flowlabel changes, as done - in "tcp: change IPv6 flow-label upon receiving spurious retransmission" - and "tcp: Change txhash on every SYN and RTO retransmit" - - 2: enabled for TCP RESET packets (no active listener) - If set, a RST packet sent in response to a SYN packet on a closed - port will reflect the incoming flow label. - - 4: enabled for ICMPv6 echo reply messages. - - Default: 0 - -fib_multipath_hash_policy - INTEGER - Controls which hash policy to use for multipath routes. - Default: 0 (Layer 3) - Possible values: - 0 - Layer 3 (source and destination addresses plus flow label) - 1 - Layer 4 (standard 5-tuple) - 2 - Layer 3 or inner Layer 3 if present - -anycast_src_echo_reply - BOOLEAN - Controls the use of anycast addresses as source addresses for ICMPv6 - echo reply - TRUE: enabled - FALSE: disabled - Default: FALSE - -idgen_delay - INTEGER - Controls the delay in seconds after which time to retry - privacy stable address generation if a DAD conflict is - detected. - Default: 1 (as specified in RFC7217) - -idgen_retries - INTEGER - Controls the number of retries to generate a stable privacy - address if a DAD conflict is detected. - Default: 3 (as specified in RFC7217) - -mld_qrv - INTEGER - Controls the MLD query robustness variable (see RFC3810 9.1). - Default: 2 (as specified by RFC3810 9.1) - Minimum: 1 (as specified by RFC6636 4.5) - -max_dst_opts_number - INTEGER - Maximum number of non-padding TLVs allowed in a Destination - options extension header. If this value is less than zero - then unknown options are disallowed and the number of known - TLVs allowed is the absolute value of this number. - Default: 8 - -max_hbh_opts_number - INTEGER - Maximum number of non-padding TLVs allowed in a Hop-by-Hop - options extension header. If this value is less than zero - then unknown options are disallowed and the number of known - TLVs allowed is the absolute value of this number. - Default: 8 - -max_dst_opts_length - INTEGER - Maximum length allowed for a Destination options extension - header. - Default: INT_MAX (unlimited) - -max_hbh_length - INTEGER - Maximum length allowed for a Hop-by-Hop options extension - header. - Default: INT_MAX (unlimited) - -skip_notify_on_dev_down - BOOLEAN - Controls whether an RTM_DELROUTE message is generated for routes - removed when a device is taken down or deleted. IPv4 does not - generate this message; IPv6 does by default. 
Setting this sysctl - to true skips the message, making IPv4 and IPv6 on par in relying - on userspace caches to track link events and evict routes. - Default: false (generate message) - -nexthop_compat_mode - BOOLEAN - The new nexthop API provides a means for managing nexthops independent of - prefixes. Backwards compatibility with the old route format is enabled by - default, which means route dumps and notifications contain the new - nexthop attribute but also the full, expanded nexthop definition. - Further, updates or deletes of a nexthop configuration generate route - notifications for each fib entry using the nexthop. Once a system - understands the new API, this sysctl can be disabled to achieve the full - performance benefits of the new API by disabling the nexthop expansion - and extraneous notifications. - Default: true (backward compat mode) - -IPv6 Fragmentation: - -ip6frag_high_thresh - INTEGER - Maximum memory used to reassemble IPv6 fragments. When - ip6frag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ip6frag_low_thresh - is reached. - -ip6frag_low_thresh - INTEGER - See ip6frag_high_thresh - -ip6frag_time - INTEGER - Time in seconds to keep an IPv6 fragment in memory. - -IPv6 Segment Routing: - -seg6_flowlabel - INTEGER - Controls the behaviour of computing the flowlabel of the outer - IPv6 header in case of SR T.encaps - - -1 set flowlabel to zero. - 0 copy flowlabel from the inner packet in case of inner IPv6 - (set flowlabel to 0 in case of IPv4/L2) - 1 compute the flowlabel using seg6_make_flowlabel() - - Default is 0. - -conf/default/*: - Change the interface-specific default settings. - - -conf/all/*: - Change all the interface-specific settings. - - [XXX: Other special features than forwarding?] - -conf/all/forwarding - BOOLEAN - Enable global IPv6 forwarding between all interfaces. - - IPv4 and IPv6 work differently here; e.g. netfilter must be used - to control which interfaces may forward packets and which may not. - - This also sets all interfaces' Host/Router setting - 'forwarding' to the specified value. See below for details. - - This is referred to as global forwarding. - -proxy_ndp - BOOLEAN - Do proxy NDP. - -fwmark_reflect - BOOLEAN - Controls the fwmark of kernel-generated IPv6 reply packets that are not - associated with a socket (for example, TCP RSTs or ICMPv6 echo replies). - If unset, these packets have a fwmark of zero. If set, they have the - fwmark of the packet they are replying to. - Default: 0 - -conf/interface/*: - Change special settings per interface. - - The functional behaviour for certain settings is different - depending on whether local forwarding is enabled or not. - -accept_ra - INTEGER - Accept Router Advertisements; autoconfigure using them. - - It also determines whether or not to transmit Router - Solicitations. If and only if the functional setting is to - accept Router Advertisements, Router Solicitations will be - transmitted. - - Possible values are: - 0 Do not accept Router Advertisements. - 1 Accept Router Advertisements if forwarding is disabled. - 2 Overrule forwarding behaviour. Accept Router Advertisements - even if forwarding is enabled. - - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. - -accept_ra_defrtr - BOOLEAN - Learn the default router in Router Advertisements. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled.
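These per-interface knobs are ordinary procfs files, so they can be inspected programmatically as well as with sysctl(8). A minimal sketch in C (illustrative only; the interface name "eth0" and the knob name "accept_ra" are example values, and errors are collapsed into a -1 return):

#include <stdio.h>
#include <stdlib.h>

/* Read one IPv6 per-interface setting via procfs. */
static long read_ipv6_conf(const char *ifname, const char *knob)
{
	char path[128];
	char buf[64];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/proc/sys/net/ipv6/conf/%s/%s", ifname, knob);
	f = fopen(path, "r");
	if (!f)
		return -1;	/* no such interface or knob */
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return strtol(buf, NULL, 10);
}

int main(void)
{
	printf("accept_ra(eth0) = %ld\n",
	       read_ipv6_conf("eth0", "accept_ra"));
	return 0;
}

The same pattern works for conf/all/* and conf/default/*, which are just additional directories under /proc/sys/net/ipv6/conf/.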
- -accept_ra_from_local - BOOLEAN - Accept RA with source-address that is found on local machine - if the RA is otherwise proper and able to be accepted. - Default is to NOT accept these as it may be an un-intended - network loop. - - Functional default: - enabled if accept_ra_from_local is enabled - on a specific interface. - disabled if accept_ra_from_local is disabled - on a specific interface. - -accept_ra_min_hop_limit - INTEGER - Minimum hop limit Information in Router Advertisement. - - Hop limit Information in Router Advertisement less than this - variable shall be ignored. - - Default: 1 - -accept_ra_pinfo - BOOLEAN - Learn Prefix Information in Router Advertisement. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_rt_info_min_plen - INTEGER - Minimum prefix length of Route Information in RA. - - Route Information w/ prefix smaller than this variable shall - be ignored. - - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. - -accept_ra_rt_info_max_plen - INTEGER - Maximum prefix length of Route Information in RA. - - Route Information w/ prefix larger than this variable shall - be ignored. - - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. - -accept_ra_rtr_pref - BOOLEAN - Accept Router Preference in RA. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_ra_mtu - BOOLEAN - Apply the MTU value specified in RA option 5 (RFC4861). If - disabled, the MTU specified in the RA will be ignored. - - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. - -accept_redirects - BOOLEAN - Accept Redirects. - - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. - -accept_source_route - INTEGER - Accept source routing (routing extension header). - - >= 0: Accept only routing header type 2. - < 0: Do not accept routing header. - - Default: 0 - -autoconf - BOOLEAN - Autoconfigure addresses using Prefix Information in Router - Advertisements. - - Functional default: enabled if accept_ra_pinfo is enabled. - disabled if accept_ra_pinfo is disabled. - -dad_transmits - INTEGER - The amount of Duplicate Address Detection probes to send. - Default: 1 - -forwarding - INTEGER - Configure interface-specific Host/Router behaviour. - - Note: It is recommended to have the same setting on all - interfaces; mixed router/host scenarios are rather uncommon. - - Possible values are: - 0 Forwarding disabled - 1 Forwarding enabled - - FALSE (0): - - By default, Host behaviour is assumed. This means: - - 1. IsRouter flag is not set in Neighbour Advertisements. - 2. If accept_ra is TRUE (default), transmit Router - Solicitations. - 3. If accept_ra is TRUE (default), accept Router - Advertisements (and do autoconfiguration). - 4. If accept_redirects is TRUE (default), accept Redirects. - - TRUE (1): - - If local forwarding is enabled, Router behaviour is assumed. - This means exactly the reverse from the above: - - 1. IsRouter flag is set in Neighbour Advertisements. - 2. Router Solicitations are not sent unless accept_ra is 2. - 3. Router Advertisements are ignored unless accept_ra is 2. - 4. Redirects are ignored. - - Default: 0 (disabled) if global forwarding is disabled (default), - otherwise 1 (enabled). - -hop_limit - INTEGER - Default Hop Limit to set. 
- Default: 64 - -mtu - INTEGER - Default Maximum Transmission Unit (MTU) - Default: 1280 (IPv6 required minimum) - -ip_nonlocal_bind - BOOLEAN - If set, allows processes to bind() to non-local IPv6 addresses, - which can be quite useful - but may break some applications. - Default: 0 - -router_probe_interval - INTEGER - Minimum interval (in seconds) between the Router Probing described - in RFC4191. - - Default: 60 - -router_solicitation_delay - INTEGER - Number of seconds to wait after the interface is brought up - before sending Router Solicitations. - Default: 1 - -router_solicitation_interval - INTEGER - Number of seconds to wait between Router Solicitations. - Default: 4 - -router_solicitations - INTEGER - Number of Router Solicitations to send until assuming no - routers are present. - Default: 3 - -use_oif_addrs_only - BOOLEAN - When enabled, the candidate source addresses for destinations - routed via this interface are restricted to the set of addresses - configured on this interface (see RFC 6724, section 4). - - Default: false - -use_tempaddr - INTEGER - Preference for Privacy Extensions (RFC3041). - <= 0 : disable Privacy Extensions - == 1 : enable Privacy Extensions, but prefer public - addresses over temporary addresses. - > 1 : enable Privacy Extensions and prefer temporary - addresses over public addresses. - Default: 0 (for most devices) - -1 (for point-to-point devices and loopback devices) - -temp_valid_lft - INTEGER - Valid lifetime (in seconds) for temporary addresses. - Default: 604800 (7 days) - -temp_prefered_lft - INTEGER - Preferred lifetime (in seconds) for temporary addresses. - Default: 86400 (1 day) - -keep_addr_on_down - INTEGER - Keep all IPv6 addresses on an interface down event. If set, static - global addresses with no expiration time are not flushed. - >0 : enabled - 0 : system default - <0 : disabled - - Default: 0 (addresses are removed) - -max_desync_factor - INTEGER - Maximum value for DESYNC_FACTOR, which is a random value - that ensures that clients don't synchronize with each - other and generate new addresses at exactly the same time. - The value is in seconds. - Default: 600 - -regen_max_retry - INTEGER - Number of attempts before giving up on generating - valid temporary addresses. - Default: 5 - -max_addresses - INTEGER - Maximum number of autoconfigured addresses per interface. Setting - this to zero disables the limitation. It is not recommended to set this - value too large (or to zero) because it would be an easy way to - crash the kernel by allowing too many addresses to be created. - Default: 16 - -disable_ipv6 - BOOLEAN - Disable IPv6 operation. If accept_dad is set to 2, this value - will be dynamically set to TRUE if DAD fails for the link-local - address. - Default: FALSE (enable IPv6 operation) - - When this value is changed from 1 to 0 (IPv6 is being enabled), - it will dynamically create a link-local address on the given - interface and start Duplicate Address Detection, if necessary. - - When this value is changed from 0 to 1 (IPv6 is being disabled), - it will dynamically delete all addresses and routes on the given - interface. From then on it will not be possible to add addresses/routes - to the selected interface. - -accept_dad - INTEGER - Whether to accept DAD (Duplicate Address Detection). - 0: Disable DAD - 1: Enable DAD (default) - 2: Enable DAD, and disable IPv6 operation if a MAC-based duplicate - link-local address has been found.
- - DAD operation and mode on a given interface will be selected according - to the maximum value of conf/{all,interface}/accept_dad. - -force_tllao - BOOLEAN - Enable sending the target link-layer address option even when - responding to a unicast neighbor solicitation. - Default: FALSE - - Quoting from RFC 2461, section 4.4, Target link-layer address: - - "The option MUST be included for multicast solicitations in order to - avoid infinite Neighbor Solicitation "recursion" when the peer node - does not have a cache entry to return a Neighbor Advertisements - message. When responding to unicast solicitations, the option can be - omitted since the sender of the solicitation has the correct link- - layer address; otherwise it would not have be able to send the unicast - solicitation in the first place. However, including the link-layer - address in this case adds little overhead and eliminates a potential - race condition where the sender deletes the cached link-layer address - prior to receiving a response to a previous solicitation." - -ndisc_notify - BOOLEAN - Define the mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate unsolicited neighbour advertisements when the device is brought - up or the hardware address changes. - -ndisc_tclass - INTEGER - The IPv6 Traffic Class to use by default when sending IPv6 Neighbor - Discovery (Router Solicitation, Router Advertisement, Neighbor - Solicitation, Neighbor Advertisement, Redirect) messages. - These 8 bits can be interpreted as 6 high order bits holding the DSCP - value and 2 low order bits representing ECN (which you probably want - to leave cleared). - 0 - (default) - -mldv1_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - MLDv1 report retransmit will take place. - Default: 10000 (10 seconds) - -mldv2_unsolicited_report_interval - INTEGER - The interval in milliseconds in which the next unsolicited - MLDv2 report retransmit will take place. - Default: 1000 (1 second) - -force_mld_version - INTEGER - 0 - (default) No enforcement of an MLD version, MLDv1 fallback allowed - 1 - Enforce use of MLD version 1 - 2 - Enforce use of MLD version 2 - -suppress_frag_ndisc - INTEGER - Control RFC 6980 (Security Implications of IPv6 Fragmentation - with IPv6 Neighbor Discovery) behavior: - 1 - (default) discard fragmented neighbor discovery packets - 0 - allow fragmented neighbor discovery packets - -optimistic_dad - BOOLEAN - Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled - - Optimistic Duplicate Address Detection for the interface will be enabled - if at least one of conf/{all,interface}/optimistic_dad is set to 1, - it will be disabled otherwise. - -use_optimistic - BOOLEAN - If enabled, do not classify optimistic addresses as deprecated during - source address selection. Preferred addresses will still be chosen - before optimistic addresses, subject to other ranking in the source - address selection algorithm. - 0: disabled (default) - 1: enabled - - This will be enabled if at least one of - conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. - -stable_secret - IPv6 address - This IPv6 address will be used as a secret to generate IPv6 - addresses for link-local addresses and autoconfigured - ones. All addresses generated after setting this secret will - be stable privacy ones by default. This can be changed via the - addrgenmode ip-link.
conf/default/stable_secret is used as the - secret for the namespace; the interface-specific ones can - override it. Writes to conf/all/stable_secret are refused. - - It is recommended to generate this secret during installation - of a system and keep it stable after that. - - By default the stable secret is unset. - -addr_gen_mode - INTEGER - Defines how link-local and autoconf addresses are generated. - - 0: generate address based on EUI64 (default) - 1: do not generate a link-local address, use EUI64 for addresses generated - from autoconf - 2: generate stable privacy addresses, using the secret from - stable_secret (RFC7217) - 3: generate stable privacy addresses, using a random secret if unset - -drop_unicast_in_l2_multicast - BOOLEAN - Drop any unicast IPv6 packets that are received in link-layer - multicast (or broadcast) frames. - - By default this is turned off. - -drop_unsolicited_na - BOOLEAN - Drop all unsolicited neighbor advertisements, for example if there's - a known good NA proxy on the network and such frames need not be used - (or in the case of 802.11, must not be used to prevent attacks.) - - By default this is turned off. - -enhanced_dad - BOOLEAN - Include a nonce option in the IPv6 neighbor solicitation messages used for - duplicate address detection per RFC7527. A received DAD NS will only signal - a duplicate address if the nonce is different. This avoids any false - detection of duplicates due to loopback of the NS messages that we send. - The nonce option will be sent on an interface unless both of - conf/{all,interface}/enhanced_dad are set to FALSE. - Default: TRUE - -icmp/*: -ratelimit - INTEGER - Limit the maximal rates for sending ICMPv6 messages. - 0 to disable any limiting, - otherwise the minimal space between responses in milliseconds. - Default: 1000 - -ratemask - list of comma separated ranges - For ICMPv6 message types matching the ranges in the ratemask, limit - the sending of the message according to the ratelimit parameter. - - The format used for both input and output is a comma separated - list of ranges (e.g. "0-127,129" for ICMPv6 message types 0 to 127 and - 129). Writing to the file will clear all previous ranges of ICMPv6 - message types and update the current list with the input. - - Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml - for numerical values of ICMPv6 message types, e.g. echo request is 128 - and echo reply is 129. - - Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) - -echo_ignore_all - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol. - Default: 0 - -echo_ignore_multicast - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol via multicast. - Default: 0 - -echo_ignore_anycast - BOOLEAN - If set non-zero, then the kernel will ignore all ICMP ECHO - requests sent to it over the IPv6 protocol destined to an anycast address. - Default: 0 - -xfrm6_gc_thresh - INTEGER - (Obsolete since linux-4.14) - The threshold at which we will start garbage collecting for IPv6 - destination cache entries. At twice this value the system will - refuse new allocations. - - -IPv6 Update by: -Pekka Savola -YOSHIFUJI Hideaki / USAGI Project - - -/proc/sys/net/bridge/* Variables: - -bridge-nf-call-arptables - BOOLEAN - 1 : pass bridged ARP traffic to arptables' FORWARD chain. - 0 : disable this.
- Default: 1 - -bridge-nf-call-iptables - BOOLEAN - 1 : pass bridged IPv4 traffic to iptables' chains. - 0 : disable this. - Default: 1 - -bridge-nf-call-ip6tables - BOOLEAN - 1 : pass bridged IPv6 traffic to ip6tables' chains. - 0 : disable this. - Default: 1 - -bridge-nf-filter-vlan-tagged - BOOLEAN - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. - 0 : disable this. - Default: 0 - -bridge-nf-filter-pppoe-tagged - BOOLEAN - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. - 0 : disable this. - Default: 0 - -bridge-nf-pass-vlan-input-dev - BOOLEAN - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan - interface on the bridge and set the netfilter input device to the vlan. - This allows use of e.g. "iptables -i br0.1" and makes the REDIRECT - target work with vlan-on-top-of-bridge interfaces. When no matching - vlan interface is found, or this switch is off, the input device is - set to the bridge interface. - 0: disable bridge netfilter vlan interface lookup. - Default: 0 - -proc/sys/net/sctp/* Variables: - -addip_enable - BOOLEAN - Enable or disable extension of Dynamic Address Reconfiguration - (ADD-IP) functionality specified in RFC5061. This extension provides - the ability to dynamically add and remove new addresses for the SCTP - associations. - - 1: Enable extension. - - 0: Disable extension. - - Default: 0 - -pf_enable - INTEGER - Enable or disable pf (pf is short for potentially failed) state. A value - of pf_retrans > path_max_retrans also disables pf state. That is, pf - state is disabled either when pf_enable is 0 or when pf_retrans > - path_max_retrans. Since pf_retrans and path_max_retrans can be changed - by a userspace application, a user who disabled pf state by setting - pf_retrans > path_max_retrans may find it silently re-enabled when one - of those two values is changed later. This sysctl therefore provides an - explicit way to enable and disable pf state dynamically. See: - https://datatracker.ietf.org/doc/draft-ietf-tsvwg-sctp-failover for - details. - - 1: Enable pf. - - 0: Disable pf. - - Default: 1 - -pf_expose - INTEGER - Unset or enable/disable pf (pf is short for potentially failed) state - exposure. Applications can control the exposure of the PF path state - in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO - sockopt. When it's unset, no SCTP_PEER_ADDR_CHANGE event with - SCTP_ADDR_PF state will be sent, but an SCTP_PF-state transport info - can still be retrieved via the SCTP_GET_PEER_ADDR_INFO sockopt. When - it's enabled, an SCTP_PEER_ADDR_CHANGE event will be sent for a - transport entering SCTP_PF state and an SCTP_PF-state transport info - can be retrieved via the SCTP_GET_PEER_ADDR_INFO sockopt. When it's - disabled, no SCTP_PEER_ADDR_CHANGE event will be sent and - SCTP_GET_PEER_ADDR_INFO returns -EACCES when trying to retrieve an - SCTP_PF-state transport info. - - 0: Unset pf state exposure, compatible with old applications. - - 1: Disable pf state exposure. - - 2: Enable pf state exposure. - - Default: 0 - -addip_noauth_enable - BOOLEAN - Dynamic Address Reconfiguration (ADD-IP) requires the use of - authentication to protect the operations of adding or removing new - addresses. This requirement is mandated so that unauthorized hosts - would not be able to hijack associations. However, older - implementations may not have implemented this requirement while - allowing the ADD-IP extension.
For reasons of interoperability, - we provide this variable to control the enforcement of the - authentication requirement. - - 1: Allow ADD-IP extension to be used without authentication. This - should only be set in a closed environment for interoperability - with older implementations. - - 0: Enforce the authentication requirement - - Default: 0 - -auth_enable - BOOLEAN - Enable or disable the Authenticated Chunks extension. This extension - provides the ability to send and receive authenticated chunks and is - required for secure operation of the Dynamic Address Reconfiguration - (ADD-IP) extension. - - 1: Enable this extension. - 0: Disable this extension. - - Default: 0 - -prsctp_enable - BOOLEAN - Enable or disable the Partial Reliability extension (RFC3758) which - is used to notify peers that a given DATA chunk should no longer be expected. - - 1: Enable extension - 0: Disable - - Default: 1 - -max_burst - INTEGER - The limit of the number of new packets that can be initially sent. It - controls how bursty the generated traffic can be. - - Default: 4 - -association_max_retrans - INTEGER - Set the maximum number of retransmissions that an association can - attempt before deciding that the remote end is unreachable. If this value - is exceeded, the association is terminated. - - Default: 10 - -max_init_retransmits - INTEGER - The maximum number of retransmissions of INIT and COOKIE-ECHO chunks - that an association will attempt before declaring the destination - unreachable and terminating. - - Default: 8 - -path_max_retrans - INTEGER - The maximum number of retransmissions that will be attempted on a given - path. Once this threshold is exceeded, the path is considered - unreachable, and new traffic will use a different path when the - association is multihomed. - - Default: 5 - -pf_retrans - INTEGER - The number of retransmissions that will be attempted on a given path - before traffic is redirected to an alternate transport (should one - exist). Note this is distinct from path_max_retrans, as a path that - passes the pf_retrans threshold can still be used. It is only - deprioritized when a transmission path is selected by the stack. This - setting is primarily used to enable fast failover mechanisms without - having to reduce path_max_retrans to a very low value. See: - http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt - for details. Note also that a value of pf_retrans > path_max_retrans - disables this feature. Since both pf_retrans and path_max_retrans can - be changed by a userspace application, the variable pf_enable is used to - disable pf state. - - Default: 0 - -ps_retrans - INTEGER - Primary.Switchover.Max.Retrans (PSMR) is a tunable parameter coming - from section 5, "Primary Path Switchover", of RFC 7829. The primary path - will be changed to another active path when the path error counter on - the old primary path exceeds PSMR, so that "the SCTP sender is allowed - to continue data transmission on a new working path even when the old - primary destination address becomes active again". Note this feature - is disabled by default by initializing 'ps_retrans' per netns as 0xffff, - and its value can't be less than 'pf_retrans' when changed via sysctl. - - Default: 0xffff - -rto_initial - INTEGER - The initial round trip timeout value in milliseconds that will be used - in calculating round trip times. This is the initial time interval - for retransmissions. - - Default: 3000 - -rto_max - INTEGER - The maximum value (in milliseconds) of the round trip timeout.
This - is the largest time interval that can elapse between retransmissions. - - Default: 60000 - -rto_min - INTEGER - The minimum value (in milliseconds) of the round trip timeout. This - is the smallest time interval that can elapse between retransmissions. - - Default: 1000 - -hb_interval - INTEGER - The interval (in milliseconds) between HEARTBEAT chunks. These chunks - are sent at the specified interval on idle paths to probe the state of - a given path between the two endpoints of an association. - - Default: 30000 - -sack_timeout - INTEGER - The amount of time (in milliseconds) that the implementation will wait - to send a SACK. - - Default: 200 - -valid_cookie_life - INTEGER - The default lifetime of the SCTP cookie (in milliseconds). The cookie - is used during association establishment. - - Default: 60000 - -cookie_preserve_enable - BOOLEAN - Enable or disable the ability to extend the lifetime of the SCTP cookie - that is used during the establishment phase of an SCTP association. - - 1: Enable cookie lifetime extension. - 0: Disable - - Default: 1 - -cookie_hmac_alg - STRING - Select the HMAC algorithm used when generating the cookie value sent by - a listening sctp socket to a connecting client in the INIT-ACK chunk. - Valid values are: - * md5 - * sha1 - * none - The ability to assign md5 or sha1 as the selected alg is predicated on the - configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and - CONFIG_CRYPTO_SHA1). - - Default: Dependent on configuration. MD5 if available, else SHA1 if - available, else none. - -rcvbuf_policy - INTEGER - Determines if the receive buffer is attributed to the socket or to the - association. SCTP supports the capability to create multiple - associations on a single socket. When using this capability, it is - possible that a single stalled association that's buffering a lot - of data may block other associations from delivering their data by - consuming all of the receive buffer space. To work around this, - the rcvbuf_policy could be set to attribute the receive buffer space - to each association instead of the socket. This prevents the described - blocking. - - 1: rcvbuf space is per association - 0: rcvbuf space is per socket - - Default: 0 - -sndbuf_policy - INTEGER - Similar to rcvbuf_policy above, this applies to send buffer space. - - 1: Send buffer is tracked per association - 0: Send buffer is tracked per socket. - - Default: 0 - -sctp_mem - vector of 3 INTEGERs: min, pressure, max - Number of pages allowed for queueing by all SCTP sockets. - - min: Below this number of pages SCTP is not bothered about its - memory appetite. When the amount of memory allocated by SCTP exceeds - this number, SCTP starts to moderate memory usage. - - pressure: This value was introduced to follow the format of tcp_mem. - - max: Number of pages allowed for queueing by all SCTP sockets. - - Default is calculated at boot time from the amount of available memory. - -sctp_rmem - vector of 3 INTEGERs: min, default, max - Only the first value ("min") is used, "default" and "max" are - ignored. - - min: Minimal size of the receive buffer used by SCTP sockets. - It is guaranteed to each SCTP socket (but not association) even - under moderate memory pressure. - - Default: 4K - -sctp_wmem - vector of 3 INTEGERs: min, default, max - Currently this tunable has no effect.
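The rto_initial/rto_min/rto_max sysctls above only seed defaults; applications can query or override the per-socket values through the standard SCTP_RTOINFO socket option from RFC 6458. A minimal sketch, assuming the lksctp-tools header <netinet/sctp.h> is installed and the kernel was built with SCTP support (values are in milliseconds):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

int main(void)
{
	struct sctp_rtoinfo rto;
	socklen_t len = sizeof(rto);
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (fd < 0) {
		perror("socket");	/* e.g. kernel built without SCTP */
		return 1;
	}

	memset(&rto, 0, sizeof(rto));
	/* srto_assoc_id == 0 addresses the socket-wide defaults, which
	 * are seeded from the rto_initial/rto_min/rto_max sysctls.
	 */
	if (getsockopt(fd, IPPROTO_SCTP, SCTP_RTOINFO, &rto, &len) < 0) {
		perror("getsockopt");
		close(fd);
		return 1;
	}
	printf("RTO initial=%u min=%u max=%u (ms)\n",
	       rto.srto_initial, rto.srto_min, rto.srto_max);
	close(fd);
	return 0;
}

The same structure can be passed to setsockopt() to change the values for a single socket without touching the namespace-wide sysctls.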
- -addr_scope_policy - INTEGER - Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 - - 0 - Disable IPv4 address scoping - 1 - Enable IPv4 address scoping - 2 - Follow draft but allow IPv4 private addresses - 3 - Follow draft but allow IPv4 link local addresses - - Default: 1 - - -/proc/sys/net/core/* - Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. - - -/proc/sys/net/unix/* -max_dgram_qlen - INTEGER - The maximum length of the dgram socket receive queue - - Default: 10 - diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index 10e11099e74a..4edd0d38779e 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -792,7 +792,7 @@ counters to indicate the ACK is skipped in which scenario. The ACK would only be skipped if the received packet is either a SYN packet or it has no data. -.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt +.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.rst * TcpExtTCPACKSkippedSynRecv diff --git a/net/Kconfig b/net/Kconfig index df8d8c9bd021..8b1f85820a6b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -86,7 +86,7 @@ config INET "Sysctl support" below, you can change various aspects of the behavior of the TCP/IP code by writing to the (virtual) files in /proc/sys/net/ipv4/*; the options are explained in the file - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. Short answer: say Y. diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..5da4733067fb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -49,7 +49,7 @@ config IP_ADVANCED_ROUTER Note that some distributions enable it in startup scripts. For details about rp_filter strict and loose mode read - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. If unsure, say N here. diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc61f51d87a3..956a806649f7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -853,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb) case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see - * Documentation/networking/ip-sysctl.txt + * Documentation/networking/ip-sysctl.rst */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: -- cgit v1.2.3 From 19093313cb0486d568232934bb80dd422d891623 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:50 +0200 Subject: docs: networking: convert ipv6.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - mark a literal as such, in order to avoid a warning; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/networking/index.rst | 1 + Documentation/networking/ipv6.rst | 78 +++++++++++++++++++++++++ Documentation/networking/ipv6.txt | 72 ----------------------- net/ipv6/Kconfig | 2 +- 5 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 Documentation/networking/ipv6.rst delete mode 100644 Documentation/networking/ipv6.txt (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e37db6f1be64..e43f2e1f2958 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -356,7 +356,7 @@ shot down by NMI autoconf= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst.
show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller Limit apic dumping. The parameter defines the maximal @@ -872,7 +872,7 @@ miss to occur. disable= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. hardened_usercopy= [KNL] Under CONFIG_HARDENED_USERCOPY, whether @@ -912,7 +912,7 @@ to workaround buggy firmware. disable_ipv6= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. disable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7d133d8dbe2a..709675464e51 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -70,6 +70,7 @@ Contents: iphase ipsec ip-sysctl + ipv6 .. only:: subproject and html diff --git a/Documentation/networking/ipv6.rst b/Documentation/networking/ipv6.rst new file mode 100644 index 000000000000..ba09c2f2dcc7 --- /dev/null +++ b/Documentation/networking/ipv6.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +IPv6 +==== + + +Options for the ipv6 module are supplied as parameters at load time. + +Module options may be given as command line arguments to the insmod +or modprobe command, but are usually specified in either +``/etc/modules.d/*.conf`` configuration files, or in a distro-specific +configuration file. + +The available ipv6 module parameters are listed below. If a parameter +is not specified the default value is used. + +The parameters are as follows: + +disable + + Specifies whether to load the IPv6 module, but disable all + its functionality. This might be used when another module + has a dependency on the IPv6 module being loaded, but no + IPv6 addresses or operations are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled. + + This is the default value. + + 1 + IPv6 is disabled. + + No IPv6 addresses will be added to interfaces, and + it will not be possible to open an IPv6 socket. + + A reboot is required to enable IPv6. + +autoconf + + Specifies whether to enable IPv6 address autoconfiguration + on all interfaces. This might be used when one does not wish + for addresses to be automatically generated from prefixes + received in Router Advertisements. + + The possible values and their effects are: + + 0 + IPv6 address autoconfiguration is disabled on all interfaces. + + Only the IPv6 loopback address (::1) and link-local addresses + will be added to interfaces. + + 1 + IPv6 address autoconfiguration is enabled on all interfaces. + + This is the default value. + +disable_ipv6 + + Specifies whether to disable IPv6 on all interfaces. + This might be used when no IPv6 addresses are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled on all interfaces. + + This is the default value. + + 1 + IPv6 is disabled on all interfaces. + + No IPv6 addresses will be added to interfaces. + diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt deleted file mode 100644 index 6cd74fa55358..000000000000 --- a/Documentation/networking/ipv6.txt +++ /dev/null @@ -1,72 +0,0 @@ - -Options for the ipv6 module are supplied as parameters at load time. - -Module options may be given as command line arguments to the insmod -or modprobe command, but are usually specified in either -/etc/modules.d/*.conf configuration files, or in a distro-specific -configuration file. - -The available ipv6 module parameters are listed below. 
If a parameter -is not specified the default value is used. - -The parameters are as follows: - -disable - - Specifies whether to load the IPv6 module, but disable all - its functionality. This might be used when another module - has a dependency on the IPv6 module being loaded, but no - IPv6 addresses or operations are desired. - - The possible values and their effects are: - - 0 - IPv6 is enabled. - - This is the default value. - - 1 - IPv6 is disabled. - - No IPv6 addresses will be added to interfaces, and - it will not be possible to open an IPv6 socket. - - A reboot is required to enable IPv6. - -autoconf - - Specifies whether to enable IPv6 address autoconfiguration - on all interfaces. This might be used when one does not wish - for addresses to be automatically generated from prefixes - received in Router Advertisements. - - The possible values and their effects are: - - 0 - IPv6 address autoconfiguration is disabled on all interfaces. - - Only the IPv6 loopback address (::1) and link-local addresses - will be added to interfaces. - - 1 - IPv6 address autoconfiguration is enabled on all interfaces. - - This is the default value. - -disable_ipv6 - - Specifies whether to disable IPv6 on all interfaces. - This might be used when no IPv6 addresses are desired. - - The possible values and their effects are: - - 0 - IPv6 is enabled on all interfaces. - - This is the default value. - - 1 - IPv6 is disabled on all interfaces. - - No IPv6 addresses will be added to interfaces. - diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..5a6111da26c4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -13,7 +13,7 @@ menuconfig IPV6 For general information about IPv6, see . For specific information about IPv6 under Linux, see - Documentation/networking/ipv6.txt and read the HOWTO at + Documentation/networking/ipv6.rst and read the HOWTO at To compile this protocol support as a module, choose M here: the -- cgit v1.2.3 From 08fad438bed0ada1a3308987862327286fcbb5f5 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 25 Apr 2020 18:57:12 +0300 Subject: mac80211: TX legacy rate control for Beacon frames Use the Beacon frame specific legacy rate configuration, if specified for AP or mesh, instead of the generic rate mask when selecting the TX rate for Beacon frames. 
Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200425155713.25687-4-jouni@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 26 +++++++++++++++++++++++++- net/mac80211/ieee80211_i.h | 4 ++++ net/mac80211/mesh.c | 1 + net/mac80211/tx.c | 5 ++++- 4 files changed, 34 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ae3e06375a28..548a384b0509 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -994,7 +994,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_TWT | BSS_CHANGED_HE_OBSS_PD | BSS_CHANGED_HE_BSS_COLOR; - int err; + int i, err; int prev_beacon_int; old = sdata_dereference(sdata->u.ap.beacon, sdata); @@ -1085,6 +1085,17 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + params->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); if (err < 0) { ieee80211_vif_release_channel(sdata); @@ -1189,6 +1200,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) ieee80211_free_keys(sdata, true); sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; sdata->vif.bss_conf.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); @@ -1949,6 +1961,7 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const u8 *old_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); + int i; /* allocate information elements */ new_ie = NULL; @@ -1987,6 +2000,17 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + setup->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9407cf44305c..8cbae66b5cdb 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -962,6 +962,10 @@ struct ieee80211_sub_if_data { bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; + /* Beacon frame (non-MCS) rate (as a bitmap) */ + u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; + bool beacon_rate_set; + union { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 36978a0e5000..5930d07b1e43 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -994,6 +994,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) /* stop the beacon */ ifmsh->mesh_id_len = 0; sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); 
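As a side note on the bitmap semantics used by beacon_rateidx_mask[] in the hunks above: each set bit selects one index into the band's legacy bitrate table, and rate control is restricted to the set bits. The following stand-alone sketch is not mac80211 code; the function name is made up for illustration and simply shows how a rate-control step could pick the lowest permitted index from such a mask:

#include <stdio.h>

/* Illustrative only: return the lowest rate index permitted by a
 * rate_idx_mask-style bitmap, or -1 for an empty mask.
 */
static int lowest_rate_idx(unsigned int mask)
{
	int i;

	for (i = 0; i < 32; i++)
		if (mask & (1u << i))
			return i;
	return -1;
}

int main(void)
{
	unsigned int beacon_mask = 0x0000000f;	/* four lowest legacy rates */

	printf("chosen rate index: %d\n", lowest_rate_idx(beacon_mask));
	return 0;
}

Real rate-control algorithms consult more state than this, but the mask contract is the same: an all-zero mask means "no restriction configured", which is why the tx.c hunk below falls back to rc_rateidx_mask when the beacon mask is unset.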
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3dc1990e15c5..6dad67eb60b2 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4883,7 +4883,10 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, txrc.bss_conf = &sdata->vif.bss_conf; txrc.skb = skb; txrc.reported_rate.idx = -1; - txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) + txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; + else + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); -- cgit v1.2.3 From 60689de46c7f6a0028c8b37b6f03db68cbfad8ed Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Fri, 24 Apr 2020 15:41:39 -0700 Subject: mac80211: fix memory overlap due to variable length param The HE operation element stored in bss_conf includes a variable-length optional field that is followed by other members of the structure. Although the optional field is never used, accesses to it actually refer to the next member of the bss_conf structure, which is not correct. Fix this by declaring only the needed HE operation fields within bss_conf itself. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1587768108-25248-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 3 +-- include/net/mac80211.h | 7 +++++-- net/mac80211/he.c | 13 +++++-------- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 9f8bc19cc5ae..06d063274eea 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1168,8 +1168,7 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, sizeof(arg->peer_he_cap_macinfo)); memcpy(&arg->peer_he_cap_phyinfo, he_cap->he_cap_elem.phy_cap_info, sizeof(arg->peer_he_cap_phyinfo)); - memcpy(&arg->peer_he_ops, &vif->bss_conf.he_operation, - sizeof(arg->peer_he_ops)); + arg->peer_he_ops = vif->bss_conf.he_oper.params; /* the top most byte is used to indicate BSS color info */ arg->peer_he_ops &= 0xffffff; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ecb219e3ec4f..78f7ce586287 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -604,7 +604,7 @@ struct ieee80211_ftm_responder_params { * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. - * @he_operation: HE operation information of the AP we are connected to + * @he_oper: HE operation information of the AP we are connected to * @he_obss_pd: OBSS Packet Detection parameters.
* @he_bss_color: BSS coloring settings, if BSS supports HE */ @@ -668,7 +668,10 @@ struct ieee80211_bss_conf { u8 bssid_indicator; bool ema_ap; u8 profile_periodicity; - struct ieee80211_he_operation he_operation; + struct { + u32 params; + u16 nss_set; + } he_oper; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; }; diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 1087f715338b..f520552b22be 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -57,17 +57,14 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, - const struct ieee80211_he_operation *he_op_ie_elem) + const struct ieee80211_he_operation *he_op_ie) { - struct ieee80211_he_operation *he_operation = - &vif->bss_conf.he_operation; - - if (!he_op_ie_elem) { - memset(he_operation, 0, sizeof(*he_operation)); + memset(&vif->bss_conf.he_oper, 0, sizeof(vif->bss_conf.he_oper)); + if (!he_op_ie) return; - } - vif->bss_conf.he_operation = *he_op_ie_elem; + vif->bss_conf.he_oper.params = __le32_to_cpu(he_op_ie->he_oper_params); + vif->bss_conf.he_oper.nss_set = __le16_to_cpu(he_op_ie->he_mcs_nss_set); } void -- cgit v1.2.3 From a54776f2c4939bdee084c9ecd00a4a5a25b7c429 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Wed, 29 Apr 2020 18:20:58 +0800 Subject: netpoll: Fix use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..15b366a1a958 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -307,7 +308,7 @@ static int netpoll_owner_active(struct net_device *dev) void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; -- cgit v1.2.3 From ad56623119fda623ade9bc570294a1c2b203d374 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 29 Apr 2020 21:24:30 +0800 Subject: net: hsr: remove unused inline functions There's no callers in-tree anymore. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/hsr/hsr_main.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 7321cf8d6d2c..f74193465bf5 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -62,15 +62,6 @@ struct hsr_tag { * with the path field in-between, which seems strange. I'm guessing the MAC * address definition is in error. 
*/ -static inline u16 get_hsr_tag_path(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) >> 12; -} - -static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) & 0x0FFF; -} static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { @@ -103,16 +94,6 @@ struct hsr_sup_payload { unsigned char macaddress_A[ETH_ALEN]; } __packed; -static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_path((struct hsr_tag *)hst); -} - -static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_LSDU_size((struct hsr_tag *)hst); -} - static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { set_hsr_tag_path((struct hsr_tag *)hst, path); -- cgit v1.2.3 From 0477e032a9ea34780180d9d5289f9da06714de43 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 29 Apr 2020 21:25:30 +0800 Subject: ila: remove unused inline function ila_addr_is_ila There are no callers in-tree anymore since commit 84287bb32856 ("ila: add checksum neutral map auto"). Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv6/ila/ila.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index bb6fc0d54dae..ad5f6f6ba333 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -68,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr) return (struct ila_addr *)addr; } -static inline bool ila_addr_is_ila(struct ila_addr *iaddr) -{ - return (iaddr->ident.type != ILA_ATYPE_IID); -} - struct ila_params { struct ila_locator locator; struct ila_locator locator_match; -- cgit v1.2.3 From fdff704dc60418e9a1bac78ae09c857d05c65aa3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:37 +0200 Subject: net/smc: rework pnet table to support SMC-R failover The pnet table stored pnet ids in the smc device structures. When a device goes down, its smc device structure is freed, and when the device is brought online again it no longer has a pnet id set. Rework the pnet table implementation to store device names together with their assigned pnet ids, and to apply the pnet id to devices when they are registered. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_ib.c | 5 +- net/smc/smc_ism.c | 3 +- net/smc/smc_pnet.c | 539 +++++++++++++++++++++++++++++++---------------------- net/smc/smc_pnet.h | 2 + 4 files changed, 319 insertions(+), 230 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..440f9e319a38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -579,8 +579,9 @@ static void smc_ib_add_dev(struct ib_device *ibdev) i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); } schedule_work(&smcibdev->port_event_work); } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..32be2da2cb85 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -296,7 +296,8 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..bd01c71b827a 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -50,29 +50,26 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. - */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; /* Check if two given pnetids match */ @@ -106,14 +103,15 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ + /* remove table entry */ write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) + dev_put(pnetelem->ndev); kfree(pnetelem); rc = 0; } @@ -155,9 +153,9 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; } -/* Remove a pnet entry mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. 
*/ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -171,10 +169,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + dev_hold(ndev); + pnetelem->ndev = ndev; rc = 0; break; } @@ -183,80 +181,67 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. */ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + write_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + dev_put(pnetelem->ndev); + pnetelem->ndev = NULL; + rc = 0; + break; } - spin_unlock(&smcd_dev_list.lock); } + write_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 
0 : -EEXIST; + spin_lock(&smc_ib_devices.lock); + if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + spin_unlock(&smc_ib_devices.lock); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; - } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); + spin_lock(&smcd_dev_list.lock); + if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_unlock(&smcd_dev_list.lock); + return applied; } /* The limit for pnetid is 16 characters. @@ -323,57 +308,167 @@ out: return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. 
If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + new_pe->ndev = ndev; + + rc = -EEXIST; + new_netdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + return 0; + +out_put: + if (ndev) + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. + */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. 
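+ * Called from smc_pnet_add(); dispatches to smc_pnet_add_eth() and/or
+ * smc_pnet_add_ib() depending on which netlink attributes are present.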
*/ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 
0 : -EEXIST; error: return rc; @@ -381,28 +476,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +504,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +526,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +545,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ + /* dump pnettable entries */ read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - 
!smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - - /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - return idx; } @@ -659,6 +676,9 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + return NOTIFY_OK; default: return NOTIFY_DONE; } @@ -744,7 +764,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; @@ -755,6 +775,34 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini) +{ + struct smc_ib_device *ibdev; + int i; + + ini->ib_dev = NULL; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + goto out; + } + } + } +out: + spin_unlock(&smc_ib_devices.lock); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -801,8 +849,6 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, @@ -811,25 +857,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + 
_smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -895,3 +923,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. + */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..ea207f8fc6f7 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -46,5 +46,7 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); #endif -- cgit v1.2.3 From f3c1deddb21c19fb0eec3c61e80567ef4a79b58b Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:38 +0200 Subject: net/smc: separate function for link initialization Move the initialization of a new link into its own function, separate from smc_lgr_create, to allow more than one link per link group. Do an extra check if the IB device initialization was successful, and reset the link state if any error occurs during smcr_link_init(). And rename two existing functions to use the prefix smcr_ to indicate that they belong to the SMC-R code path. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 114 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 824c5211b027..3bb45c33db22 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -179,7 +179,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). 
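+ * Returns 0 if the DELETE LINK request was sent, non-zero otherwise; on
+ * success the caller reschedules its free work in case no response arrives.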
*/ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { @@ -219,7 +219,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (!lgr->is_smcd && !lgr->terminating) { /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { + !smcr_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); spin_unlock_bh(lgr_lock); @@ -245,6 +245,64 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } +static int smcr_link_init(struct smc_link *lnk, u8 link_id, + struct smc_init_info *ini) +{ + u8 rndvec[3]; + int rc; + + get_device(&ini->ib_dev->ibdev->dev); + atomic_inc(&ini->ib_dev->lnk_cnt); + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = link_id; + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk); +out: + put_device(&ini->ib_dev->ibdev->dev); + memset(lnk, 0, sizeof(struct smc_link)); + if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) + wake_up(&ini->ib_dev->lnks_deleted); + return rc; +} + /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -252,7 +310,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; int rc = 0; int i; @@ -297,48 +354,17 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; - lgr_list = &smc_lgr_list.list; - lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); - if (rc) - goto free_lgr; - rc = smc_llc_link_init(lnk); + rc = smcr_link_init(lnk, SMC_SINGLE_LINK, ini); if (rc) goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -346,14 +372,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); free_lgr: kfree(lgr); ism_put_vlan: @@ -417,7 +435,7 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smc_link_clear(struct smc_link *lnk) +static void smcr_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; smc_llc_link_clear(lnk); @@ -426,6 +444,7 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + put_device(&lnk->smcibdev->ibdev->dev); if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) wake_up(&lnk->smcibdev->lnks_deleted); } @@ -512,8 +531,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + smcr_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } -- cgit v1.2.3 From 026c381fb4778d0d44af57b7ff674f31f04af221 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:39 +0200 Subject: net/smc: introduce link_idx for link group array The link_id is the index of the link in the array of the link group. When a link in the array is reused for a new link, a different unique link_id should be used, otherwise the index in the array could collide with the previous link at this array position. Use a new variable link_idx as array index, and make link_id an increasing unique id value. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 34 +++++++++++++++++++++++++++++----- net/smc/smc_core.h | 2 ++ 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3bb45c33db22..d233112ced2a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -245,8 +245,28 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } -static int smcr_link_init(struct smc_link *lnk, u8 link_id, - struct smc_init_info *ini) +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_INACTIVE && + lgr->lnk[i].link_id == link_id) + continue; + } + break; + } + return link_id; +} + +static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) { u8 rndvec[3]; int rc; @@ -254,7 +274,8 @@ static int smcr_link_init(struct smc_link *lnk, u8 link_id, get_device(&ini->ib_dev->ibdev->dev); atomic_inc(&ini->ib_dev->lnk_cnt); lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = link_id; + lnk->link_id = smcr_next_link_id(lgr); + lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; @@ -310,6 +331,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; + u8 link_idx; int rc = 0; int i; @@ -338,6 +360,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -358,8 +381,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = smcr_link_init(lnk, SMC_SINGLE_LINK, ini); + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) goto free_lgr; lgr_list = &smc_lgr_list.list; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8041db20c753..c459b0639bf3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -115,6 +115,7 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_idx; /* index in lgr link array */ enum smc_link_state state; /* state of link */ struct workqueue_struct *llc_wq; /* single thread work queue */ @@ -222,6 +223,7 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; }; struct { /* SMC-D */ u64 peer_gid; -- cgit v1.2.3 From 387707fdf48697c667dd5e9715ac4feb41602d15 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:40 +0200 Subject: net/smc: convert static link ID to dynamic references As a preparation for the support of multiple links remove the usage of a static link id (SMC_SINGLE_LINK) and allow dynamic link ids. 
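For illustration only (this snippet is not part of the patch), the access pattern changes roughly as follows: the connection now carries the link it was assigned to, and the per-link arrays are indexed by that link's slot in the link group:

	/* before: every user hard-coded the one and only link slot */
	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
	rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);

	/* after: take the link from the connection and index the per-link
	 * arrays (mr_rx[], sgt[], rtokens[][]) by its link_idx
	 */
	struct smc_link *link = conn->lnk;
	rkey = htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);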
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 17 +++++--------- net/smc/smc.h | 1 + net/smc/smc_cdc.c | 8 +++---- net/smc/smc_clc.c | 12 +++++----- net/smc/smc_core.c | 66 +++++++++++++++++++++++++++--------------------------- net/smc/smc_core.h | 7 +++--- net/smc/smc_ib.c | 58 +++++++++++++++++++++++------------------------ net/smc/smc_ib.h | 10 ++++----- net/smc/smc_llc.c | 10 ++++----- net/smc/smc_tx.c | 13 ++++++----- 10 files changed, 99 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..6e4bad8c64a8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -343,7 +343,7 @@ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, { if (!rmb_desc->wr_reg) { /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { rmb_desc->regerr = 1; return -EFAULT; } @@ -362,12 +362,10 @@ static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ rest = wait_for_completion_interruptible_timeout( &link->llc_confirm, @@ -610,7 +608,7 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); @@ -1002,13 +1000,10 @@ void smc_close_non_accepted(struct sock *sk) static int smc_serv_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1194,7 +1189,7 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; if (local_contact != SMC_FIRST_CONTACT) { if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) @@ -1210,7 +1205,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_contact == SMC_FIRST_CONTACT) diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..1a084afa7372 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -121,6 +121,7 @@ enum smc_urg_state { struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. 
id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..f64589d823aa 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -57,7 +57,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -91,12 +91,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; @@ -165,7 +163,7 @@ static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, smc_cdc_tx_filter, smc_cdc_tx_dismisser, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ea0068f0173c..d5627df24215 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -496,7 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) sizeof(SMCD_EYECATCHER)); } else { /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.path = SMC_TYPE_R; @@ -508,13 +508,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -572,7 +572,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); @@ -580,13 +580,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d233112ced2a..1d695093f205 100644 --- a/net/smc/smc_core.c +++ 
b/net/smc/smc_core.c @@ -131,6 +131,11 @@ static void smc_lgr_register_conn(struct smc_connection *conn) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); + + /* assign the new connection to a link */ + if (!conn->lgr->is_smcd) + conn->lnk = &conn->lgr->lnk[SMC_SINGLE_LINK]; + conn->lgr->conns_num++; } @@ -275,6 +280,7 @@ static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, atomic_inc(&ini->ib_dev->lnk_cnt); lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; lnk->link_idx = link_idx; lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; @@ -421,7 +427,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], + conn->lnk, conn->rmb_desc); } conn->rmb_desc->used = 0; @@ -479,16 +485,15 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) + if (buf_desc->mr_rx[lnk->link_idx]) smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_TO_DEVICE); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); @@ -1026,17 +1031,16 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, buf_desc->cpu_addr, bufsize); /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, + rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { @@ -1049,7 +1053,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE, - buf_desc); + buf_desc, lnk->link_idx); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); @@ -1174,22 +1178,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) @@ -1198,7 +1196,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_ib_sync_sg_for_cpu(&lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc, DMA_FROM_DEVICE); } @@ -1208,7 +1206,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_ib_sync_sg_for_device(&lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc, DMA_FROM_DEVICE); } @@ -1245,15 +1243,16 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) } /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1262,22 +1261,23 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } /* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; clear_bit(i, 
lgr->rtokens_used_mask); return 0; @@ -1290,7 +1290,7 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(conn->lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c459b0639bf3..c71c35a3596c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -116,6 +116,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ + struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ struct workqueue_struct *llc_wq; /* single thread work queue */ @@ -303,8 +304,8 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); @@ -319,6 +320,6 @@ void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 440f9e319a38..c090678a3e5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -389,15 +389,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -405,29 +405,29 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr_rx[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr_rx[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr_rx[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr_rx[link_idx]); + buf_slot->mr_rx[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) return -EINVAL; return 0; } /* synchronize buffer usage for cpu access */ -void 
smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -435,11 +435,11 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -447,7 +447,7 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -455,11 +455,11 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -467,15 +467,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -483,18 +483,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5c2b115d36da..e6a696ae15f3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,10 +59,10 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct 
smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -74,12 +74,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0e52aab53d97..34d0752ba6af 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -231,9 +231,9 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -256,7 +256,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -501,7 +501,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, SMC_LLC_FLAG_RKEY_NEG; complete(&link->llc_confirm_rkey); } else { - rc = smc_rtoken_add(smc_get_lgr(link), + rc = smc_rtoken_add(link, llc->rtoken[0].rmb_vaddr, llc->rtoken[0].rmb_key); @@ -539,7 +539,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } else { max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) + if (smc_rtoken_delete(link, llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 9f1ade86d70e..d74bfe6a90f1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -269,19 +269,18 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = 
lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smc_lgr_terminate_sched(lgr); @@ -310,8 +309,10 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -507,7 +508,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } -- cgit v1.2.3 From b9247544c1bccfe1b74ddf1dade719a69946cbb1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:41 +0200 Subject: net/smc: convert static link ID instances to support multiple links As a preparation for the support of multiple links remove the usage of a static link id (SMC_SINGLE_LINK) and allow dynamic link ids. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 54 ++++++--- net/smc/smc_clc.h | 1 + net/smc/smc_core.c | 332 ++++++++++++++++++++++++++++++++++++----------------- net/smc/smc_core.h | 37 +++--- net/smc/smc_llc.c | 2 + 5 files changed, 291 insertions(+), 135 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6e4bad8c64a8..890dc6422f8c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -338,28 +338,48 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) } /* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) +static int smcr_link_reg_rmb(struct smc_link *link, + struct smc_buf_desc *rmb_desc, bool conf_rkey) { - if (!rmb_desc->wr_reg) { + if (!rmb_desc->is_reg_mr[link->link_idx]) { /* register memory region for new rmb */ if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->regerr = 1; + rmb_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->wr_reg = 1; + rmb_desc->is_reg_mr[link->link_idx] = true; } if (!conf_rkey) return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->is_conf_rkey) { + if (smc_llc_do_confirm_rkey(link, rmb_desc)) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_conf_rkey = true; + } + return 0; +} + +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, + struct smc_buf_desc *rmb_desc) +{ + int i, rc; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc, true); + if (rc) + return rc; } return 0; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; @@ -387,7 +407,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return 
SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK response over RoCE fabric */ @@ -632,7 +652,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(smc->conn.lgr, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -647,7 +667,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + reason_code = smcr_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); @@ -997,14 +1017,14 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1189,10 +1209,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = new_smc->conn.lnk; + struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(conn->lgr, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1222,7 +1242,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); + reason_code = smcr_serv_conf_first_link(new_smc); if (reason_code) goto decline; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..4f2e150a2be1 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,7 @@ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1d695093f205..5df3f8f41d19 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -116,7 +116,7 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) * Requires @conns_lock * Note that '0' is a reserved value and not assigned. 
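+ * Returns 0 on success, or SMC_CLC_DECL_NOACTLINK when no active or
+ * activating link can be assigned to the connection.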
*/ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); @@ -133,10 +133,22 @@ static void smc_lgr_register_conn(struct smc_connection *conn) smc_lgr_add_alert_token(conn); /* assign the new connection to a link */ - if (!conn->lgr->is_smcd) - conn->lnk = &conn->lgr->lnk[SMC_SINGLE_LINK]; + if (!conn->lgr->is_smcd) { + struct smc_link *lnk; + int i; + /* tbd - link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &conn->lgr->lnk[i]; + if (lnk->state == SMC_LNK_ACTIVATING || + lnk->state == SMC_LNK_ACTIVE) + conn->lnk = lnk; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + } conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -202,8 +214,8 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; + int i; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); @@ -220,25 +232,38 @@ static void smc_lgr_free_work(struct work_struct *work) } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smcr_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); + bool do_wait = false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + /* try to send del link msg, on err free immediately */ + if (lnk->state == SMC_LNK_ACTIVE && + !smcr_link_send_delete(lnk, true)) { + /* reschedule in case we never receive a resp */ + smc_lgr_schedule_free_work(lgr); + do_wait = true; + } + } + if (do_wait) { spin_unlock_bh(lgr_lock); - return; + return; /* wait for resp, see smc_llc_rx_delete_link */ } } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + } + } smc_lgr_free(lgr); } @@ -417,29 +442,37 @@ out: return rc; } +static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, + struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + smc_llc_do_delete_rkey(lnk, rmb_desc); + rmb_desc->is_conf_rkey = false; + } + if (rmb_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + write_lock_bh(&lgr->rmbs_lock); + list_del(&rmb_desc->list); + write_unlock_bh(&lgr->rmbs_lock); + + smc_buf_free(lgr, true, rmb_desc); + } else { + rmb_desc->used = 0; + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - conn->lnk, - conn->rmb_desc); - } - conn->rmb_desc->used = 0; - } else { - /* buf 
registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); - } - } + if (conn->rmb_desc && lgr->is_smcd) + conn->rmb_desc->used = 0; + else if (conn->rmb_desc) + smcr_buf_unuse(conn->rmb_desc, conn->lnk); } /* remove a finished connection from its link group */ @@ -467,6 +500,8 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_link_clear(struct smc_link *lnk) { + if (lnk->peer_qpn == 0) + return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); @@ -482,17 +517,23 @@ static void smcr_link_clear(struct smc_link *lnk) static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *lnk; + int i; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!buf_desc->is_map_ib[lnk->link_idx]) + continue; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); } - sg_free_table(&buf_desc->sgt[lnk->link_idx]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); @@ -551,6 +592,8 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { + int i; + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -560,7 +603,11 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smcr_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_INACTIVE) + continue; + smcr_link_clear(&lgr->lnk[i]); + } if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -628,16 +675,20 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) static void smc_lgr_cleanup(struct smc_link_group *lgr) { + int i; + if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + } } } @@ -650,6 +701,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; + int i; if (lgr->terminating) return; /* lgr already terminating */ @@ -657,7 +709,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smc_llc_link_inactive(&lgr->lnk[i]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -703,14 +756,22 @@ void 
smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + if (lgr->is_smcd) + continue; + /* tbd - terminate only when no more links are active */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_INACTIVE || + lgr->lnk[i].state == SMC_LNK_DELETING) + continue; + if (lgr->lnk[i].smcibdev == smcibdev && + lgr->lnk[i].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } } spin_unlock_bh(&smc_lgr_list.lock); @@ -775,6 +836,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -783,9 +845,12 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + break; + } } } } @@ -857,15 +922,21 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role, u32 clcqpn) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + int i; + + if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && + !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -906,15 +977,17 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn); /* add conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -932,8 +1005,10 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (rc) + goto out; } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; @@ -1006,12 +1081,55 @@ 
static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } +/* map an rmb buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (rc) + return rc; + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + rc = -EAGAIN; + goto free_table; + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); @@ -1028,40 +1146,32 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + return buf_desc; +} - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); - } - sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, - buf_desc->cpu_addr, bufsize); +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0; - /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk, buf_desc, - is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); - } + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc, lnk->link_idx); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (lnk->state != SMC_LNK_ACTIVE && + lnk->state != SMC_LNK_ACTIVATING) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + smcr_buf_unuse(buf_desc, lnk); + rc = -ENOMEM; + goto out; } } - - buf_desc->len = bufsize; - return buf_desc; +out: + return rc; } #define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 
6 -> 1MB */ @@ -1159,6 +1269,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (IS_ERR(buf_desc)) return -ENOMEM; + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + return -ENOMEM; + } + } + if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; @@ -1192,22 +1308,32 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(&lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && + conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(&lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && + conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + continue; + smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c71c35a3596c..66753ba23bc6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -152,25 +152,32 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. 
rkey provided to peer + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_reg_err; + /* buffer registration err */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 34d0752ba6af..903ae068da3a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -662,6 +662,8 @@ void smc_llc_link_deleting(struct smc_link *link) /* called in tasklet context */ void smc_llc_link_inactive(struct smc_link *link) { + if (link->state == SMC_LNK_INACTIVE) + return; link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); -- cgit v1.2.3 From e07d31dc16b0d77ff6b3f71cafe3a825fb80bed4 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:42 +0200 Subject: net/smc: multi-link support for smc_rmb_rtoken_handling() Extend smc_rmb_rtoken_handling() and smc_rtoken_delete() to support multiple links. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- net/smc/smc_core.c | 14 ++++++++------ net/smc/smc_core.h | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 890dc6422f8c..e39f6aedd3bd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -640,7 +640,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, ini->cln_first_contact); @@ -1231,7 +1231,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5df3f8f41d19..e8897d60b27f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1392,19 +1392,20 @@ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) return i; } -/* delete an rtoken */ +/* delete an rtoken from all links */ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][lnk->link_idx].rkey = 0; - lgr->rtokens[i][lnk->link_idx].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1414,9 +1415,10 @@ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx 
= smc_rtoken_add(conn->lnk, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 66753ba23bc6..f68ba187ecf8 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -309,7 +309,7 @@ void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); -- cgit v1.2.3 From d854fcbfaeda9748c85de296fbe07b7763a1939c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:43 +0200 Subject: net/smc: add new link state and related helpers Before a link can be reused it must have been cleared. Lowest current link state is INACTIVE, which does not mean that the link is already cleared. Add a new state UNUSED that is set when the link is cleared and can be reused. Add helper smc_llc_usable_link() to find an active link in a link group, and smc_link_usable() to determine if a link is usable. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 36 +++++++++++++++++++----------------- net/smc/smc_core.h | 9 +++++++++ net/smc/smc_llc.c | 4 ++-- net/smc/smc_llc.h | 11 +++++++++++ net/smc/smc_wr.c | 2 +- 5 files changed, 42 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e8897d60b27f..57890cbd4e8a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -260,7 +260,7 @@ static void smc_lgr_free_work(struct work_struct *work) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) + if (smc_link_usable(lnk)) smc_llc_link_inactive(lnk); } } @@ -286,7 +286,7 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state != SMC_LNK_INACTIVE && + if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) continue; } @@ -350,6 +350,7 @@ clear_llc_lnk: out: put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) wake_up(&ini->ib_dev->lnks_deleted); return rc; @@ -500,6 +501,8 @@ void smc_conn_free(struct smc_connection *conn) static void smcr_link_clear(struct smc_link *lnk) { + struct smc_ib_device *smcibdev; + if (lnk->peer_qpn == 0) return; lnk->peer_qpn = 0; @@ -510,8 +513,11 @@ static void smcr_link_clear(struct smc_link *lnk) smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); put_device(&lnk->smcibdev->ibdev->dev); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -604,9 +610,8 @@ static void smc_lgr_free(struct smc_link_group *lgr) 
wake_up(&lgr->smcd->lgrs_deleted); } else { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_INACTIVE) - continue; - smcr_link_clear(&lgr->lnk[i]); + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i]); } if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); @@ -686,7 +691,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) + if (smc_link_usable(lnk)) smc_llc_link_inactive(lnk); } } @@ -764,7 +769,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) continue; /* tbd - terminate only when no more links are active */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state == SMC_LNK_INACTIVE || + if (!smc_link_usable(&lgr->lnk[i]) || lgr->lnk[i].state == SMC_LNK_DELETING) continue; if (lgr->lnk[i].smcibdev == smcibdev && @@ -1161,8 +1166,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_ACTIVE && - lnk->state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { smcr_buf_unuse(buf_desc, lnk); @@ -1294,14 +1298,14 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -1313,8 +1317,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && - conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); @@ -1328,8 +1331,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) if (!conn->lgr || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (conn->lgr->lnk[i].state != SMC_LNK_ACTIVE && - conn->lgr->lnk[i].state != SMC_LNK_ACTIVATING) + if (!smc_link_usable(&conn->lgr->lnk[i])) continue; smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, DMA_FROM_DEVICE); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f68ba187ecf8..2b1960c8c8ce 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,6 +32,7 @@ enum smc_lgr_role { /* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ @@ -295,6 +296,14 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* returns true if the specified link is usable */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct 
smc_clc_msg_local; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 903ae068da3a..c267f5006faa 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -372,7 +372,7 @@ static void smc_llc_send_message_work(struct work_struct *work) struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) + if (!smc_link_usable(llcwrk->link)) goto out; rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); if (rc) @@ -562,7 +562,7 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) + if (!smc_link_usable(link)) return; /* link not active, drop msg */ switch (llc->raw.hdr.common.type) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 461c0c3ef76e..08171131110c 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,17 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..93223628c002 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -207,7 +207,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + !smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); -- cgit v1.2.3 From 1020e1ef53ceef715f2bc144eebbfe01e88effcf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:44 +0200 Subject: net/smc: move testlink work to system work queue The testlink work waits for a response to the testlink request and blocks the single-threaded llc_wq. This type of work does not have to be serialized and can be moved to the system work queue. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_llc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index c267f5006faa..69cc0d65b437 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -613,14 +613,15 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (link->state != SMC_LNK_ACTIVE) + return; /* link state changed */ if (rc <= 0) { smc_lgr_terminate_sched(smc_get_lgr(link)); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - next_interval); + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } int smc_llc_link_init(struct smc_link *link) @@ -648,8 +649,8 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) link->state = SMC_LNK_ACTIVE; if (testlink_time) { link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); } } @@ -665,7 +666,7 @@ void smc_llc_link_inactive(struct smc_link *link) if (link->state == SMC_LNK_INACTIVE) return; link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); + cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); smc_wr_wakeup_tx_wait(link); } -- cgit v1.2.3 From 2140ac26f8f501d3cc8f1575e6419f1a50779496 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:45 +0200 Subject: net/smc: simplify link deactivation Cancel the testlink worker during link clear processing and remove the extra function smc_llc_link_inactive(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 8 ++------ net/smc/smc_llc.c | 15 ++++----------- net/smc/smc_llc.h | 1 - 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 57890cbd4e8a..78ccfbf6e4af 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -261,7 +261,7 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk)) - smc_llc_link_inactive(lnk); + lnk->state = SMC_LNK_INACTIVE; } } smc_lgr_free(lgr); @@ -692,7 +692,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) struct smc_link *lnk = &lgr->lnk[i]; if (smc_link_usable(lnk)) - smc_llc_link_inactive(lnk); + lnk->state = SMC_LNK_INACTIVE; } } } @@ -706,16 +706,12 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; - int i; if (lgr->terminating) return; /* lgr already terminating */ if (!soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) - smc_llc_link_inactive(&lgr->lnk[i]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 69cc0d65b437..2f03131c85fd 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -660,22 +660,15 @@ void smc_llc_link_deleting(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* called in tasklet context */ -void smc_llc_link_inactive(struct smc_link *link) -{ - if (link->state == SMC_LNK_INACTIVE) - return; - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work_sync(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { flush_workqueue(link->llc_wq); destroy_workqueue(link->llc_wq); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* register a new rtoken at the remote peer */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 08171131110c..c2c9d48d079f 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -56,7 +56,6 @@ int smc_llc_send_delete_link(struct smc_link *link, int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link, int testlink_time); void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3 From 6c8968c421e0e6bea8a78ee4fdd043d850cd5b26 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:46 +0200 Subject: net/smc: use worker to process incoming llc messages Incoming llc messages are processed in irq tasklet context, and a worker is used to send outgoing messages. The worker is needed because getting a send buffer could result in a wait for a free buffer. To make sure all incoming llc messages are processed in a serialized way, introduce an event queue and create a new queue entry for each incoming message. A new worker processes the event queue entries in order. Remove the separate worker that was used to send outgoing llc messages, because the messages are now already sent from worker context. With this event queue in place, the serialized llc_wq work queue is obsolete; remove it.
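[Editor's note] As background for this pattern, here is a minimal user-space sketch of the queue-and-worker scheme the patch introduces: producers copy each incoming message into a freshly allocated queue entry under a lock, and a single worker drains the entries in order, so message handling is serialized without a dedicated ordered work queue. All names below are invented for illustration, and pthread primitives stand in for the kernel's spinlock, GFP_ATOMIC allocation and schedule_work(); the real code in the diff uses struct smc_llc_qentry and lgr->llc_event_q instead.

/* illustrative sketch only -- not kernel code */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct qentry {
	struct qentry *next;
	char msg[44];			/* fixed-size copy, like union smc_llc_msg */
};

static struct qentry *q_head, *q_tail;
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_cond = PTHREAD_COND_INITIALIZER;

static void handle_msg(const char *msg)
{
	printf("handling: %s\n", msg);	/* runs strictly in queue order */
}

/* producer side, cf. smc_llc_rx_handler(): copy the message, append it,
 * and kick the worker
 */
static int event_enqueue(const char *msg)
{
	struct qentry *e = calloc(1, sizeof(*e));

	if (!e)
		return -1;		/* caller drops the message */
	strncpy(e->msg, msg, sizeof(e->msg) - 1);
	pthread_mutex_lock(&q_lock);
	if (q_tail)
		q_tail->next = e;
	else
		q_head = e;
	q_tail = e;
	pthread_cond_signal(&q_cond);	/* kernel: schedule_work() */
	pthread_mutex_unlock(&q_lock);
	return 0;
}

/* single worker, cf. smc_llc_event_work(): dequeue one entry at a time and
 * handle it with the lock dropped, so producers are never blocked by a
 * long-running handler
 */
static void *event_worker(void *unused)
{
	(void)unused;
	for (;;) {
		struct qentry *e;

		pthread_mutex_lock(&q_lock);
		while (!q_head)
			pthread_cond_wait(&q_cond, &q_lock);
		e = q_head;
		q_head = e->next;
		if (!q_head)
			q_tail = NULL;
		pthread_mutex_unlock(&q_lock);
		handle_msg(e->msg);
		free(e);
	}
	return NULL;
}

A caller would start exactly one event_worker() thread per link group and call event_enqueue() from any receive context; because there is only one consumer, handlers never run concurrently and need no further locking, which is the property the kernel patch relies on.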
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 4 +- net/smc/smc_core.h | 7 ++- net/smc/smc_llc.c | 142 ++++++++++++++++++++++++++++++++--------------------- net/smc/smc_llc.h | 1 + 4 files changed, 96 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 78ccfbf6e4af..a1463da14614 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -412,7 +412,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); @@ -613,6 +614,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i]); } + smc_llc_event_flush(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 2b1960c8c8ce..6548e9a06f73 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -120,7 +120,6 @@ struct smc_link { struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ @@ -233,6 +232,12 @@ struct smc_link_group { DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct work_struct llc_event_work; + /* llc event worker */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2f03131c85fd..be74876a36ae 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -134,6 +134,12 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -356,46 +362,20 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(llcwrk->link)) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); -} - -/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) -{ - struct 
smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); - - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send(link, pend); } /********************************* receive ***********************************/ @@ -452,7 +432,7 @@ static void smc_llc_rx_add_link(struct smc_link *link, link->smcibdev->mac[link->ibport - 1], link->gid, SMC_LLC_RESP); } - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -474,7 +454,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, /* server requests to delete this link, send response */ smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); smc_lgr_terminate_sched(lgr); } } @@ -487,7 +467,7 @@ static void smc_llc_rx_test_link(struct smc_link *link, complete(&link->llc_testlink_resp); } else { llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -510,7 +490,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, llc->hd.flags |= SMC_LLC_FLAG_RESP; if (rc < 0) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -522,7 +502,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, } else { /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } @@ -549,21 +529,30 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + smc_llc_send_message(link, llc); } } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +/* flush the llc event queue */ +void smc_llc_event_flush(struct smc_link_group *lgr) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ if (!smc_link_usable(link)) - return; /* link not active, drop msg */ + goto out; switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: @@ -588,6 +577,54 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) smc_llc_rx_delete_rkey(link, &llc->delete_rkey); break; } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + 
spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + union smc_llc_msg *llc = buf; + unsigned long flags; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + schedule_work(&link->lgr->llc_event_work); } /***************************** worker, utils *********************************/ @@ -626,12 +663,6 @@ out: int smc_llc_link_init(struct smc_link *link) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; init_completion(&link->llc_confirm); init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); @@ -640,6 +671,7 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_delete_rkey); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); + INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } @@ -663,8 +695,6 @@ void smc_llc_link_deleting(struct smc_link *link) /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index c2c9d48d079f..9de83495ad14 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -61,6 +61,7 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +void smc_llc_event_flush(struct smc_link_group *lgr); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From ef79d439cd124d9fb7258bb35d44c71aec11b829 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:47 +0200 Subject: net/smc: process llc responses in tasklet context When llc responses are received, the possible waiters for these responses need to be notified. This can be done in tasklet context, without using a work item on the llc work queue. Move all code that handles llc responses into smc_llc_rx_response(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_core.h | 8 +- net/smc/smc_llc.c | 216 +++++++++++++++++++++++++++-------------------------- 2 files changed, 116 insertions(+), 108 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6548e9a06f73..d785656b3489 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -129,10 +129,10 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ + int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ + struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ + int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index be74876a36ae..265889c8b03b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -384,27 +384,17 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int conf_rc = 0; /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else + if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) conf_rc = ENOTSUPP; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); - } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } + if (lgr->role == SMC_CLNT && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_rc = conf_rc; + link->link_id = llc->link_num; + complete(&link->llc_confirm); } } @@ -413,27 +403,22 @@ static void smc_llc_rx_add_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } + if (link->state == SMC_LNK_ACTIVATING) { + complete(&link->llc_add); + return; + } - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); + if (lgr->role == SMC_SERV) { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_REQ); - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc); + } else { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_RESP); } + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_link(struct smc_link *link, @@ -441,34 +426,24 @@ static void smc_llc_rx_delete_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_forget(lgr); + smc_llc_link_deleting(link); + if (lgr->role == SMC_SERV) { + /* 
client asks to delete this link, send request */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); - } - smc_llc_send_message(link, llc); - smc_lgr_terminate_sched(lgr); + /* server requests to delete this link, send response */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } + smc_llc_send_message(link, llc); + smc_lgr_terminate_sched(lgr); } static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); - } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey(struct smc_link *link, @@ -476,34 +451,24 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, { int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(link, - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + rc = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); - /* ignore rtokens for other links, we have only one link */ + /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (rc < 0) + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, struct smc_llc_msg_confirm_rkey_cont *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); - } + /* ignore rtokens for other links, we have only one link */ + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_rkey(struct smc_link *link, @@ -512,25 +477,19 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, u8 err_mask = 0; int i, max; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); - for (i = 0; i < max; i++) { - if (smc_rtoken_delete(link, llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); - } - - if (err_mask) { - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; - } + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; } + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } /* flush the llc event queue */ @@ -601,6 +560,49 @@ again: 
spin_unlock_bh(&lgr->llc_event_q_lock); } +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +{ + int rc = 0; + + switch (llc->raw.hdr.common.type) { + case SMC_LLC_TEST_LINK: + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_CONFIRM_LINK: + if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + rc = ENOTSUPP; + if (link->lgr->role == SMC_SERV && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_resp_rc = rc; + complete(&link->llc_confirm_resp); + } + break; + case SMC_LLC_ADD_LINK: + if (link->state == SMC_LNK_ACTIVATING) + complete(&link->llc_add_resp); + break; + case SMC_LLC_DELETE_LINK: + if (link->lgr->role == SMC_SERV) + smc_lgr_schedule_free_work_fast(link->lgr); + break; + case SMC_LLC_CONFIRM_RKEY: + link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_confirm_rkey_resp); + break; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* unused as long as we don't send this type of msg */ + break; + case SMC_LLC_DELETE_RKEY: + link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey_resp); + break; + } +} + /* copy received msg and add it to the event queue */ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) { @@ -615,6 +617,12 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, llc); + return; + } + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); if (!qentry) return; @@ -667,8 +675,8 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); + init_completion(&link->llc_confirm_rkey_resp); + init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); @@ -708,14 +716,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int rc; /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); + reinit_completion(&link->llc_confirm_rkey_resp); rc = smc_llc_send_confirm_rkey(link, rmb_desc); if (rc) return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_confirm_rkey_resp_rc) return -EFAULT; return 0; } @@ -729,14 +737,14 @@ int smc_llc_do_delete_rkey(struct smc_link *link, mutex_lock(&link->llc_delete_rkey_mutex); if (link->state != SMC_LNK_ACTIVE) goto out; - reinit_completion(&link->llc_delete_rkey); + reinit_completion(&link->llc_delete_rkey_resp); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || 
link->llc_delete_rkey_resp_rc) rc = -EFAULT; else rc = 0; -- cgit v1.2.3 From faca536008375bece23783e7382b5d0356c13ba5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:48 +0200 Subject: net/smc: use mutex instead of rwlock_t to protect buffers The locks for sndbufs and rmbs are never used from atomic context. Using a mutex for these locks allows them to be nested with other mutexes. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 22 +++++++++++----------- net/smc/smc_core.h | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a1463da14614..8a43d2948493 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -385,8 +385,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); @@ -456,9 +456,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, } if (rmb_desc->is_reg_err) { /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); + mutex_lock(&lgr->rmbs_lock); list_del(&rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); + mutex_unlock(&lgr->rmbs_lock); smc_buf_free(lgr, true, rmb_desc); } else { @@ -1059,19 +1059,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -1220,8 +1220,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -1262,9 +1262,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) continue; buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + mutex_unlock(lock); break; /* found */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index d785656b3489..379ced490c49 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -205,9 +205,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ -- cgit v1.2.3 From 00a049cfde95931c6832edad19d9a4be441cacf5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 29 Apr 2020 17:10:49 +0200 Subject: net/smc: move
llc layer related init and clear into smc_llc.c Introduce smc_llc_lgr_init() and smc_llc_lgr_clear() to implement all llc layer specific initialization and cleanup in module smc_llc.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_core.c | 6 +++--- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 26 +++++++++++++++++++++----- net/smc/smc_llc.h | 5 +++-- 5 files changed, 31 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e39f6aedd3bd..e859e3f420d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -381,7 +381,6 @@ static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; @@ -433,7 +432,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TIMEOUT_AL; - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } @@ -1019,7 +1018,6 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); struct smc_link *link = smc->conn.lnk; int rest; int rc; @@ -1065,7 +1063,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8a43d2948493..db49f8cd5c95 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -412,8 +412,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); - INIT_LIST_HEAD(&lgr->llc_event_q); - spin_lock_init(&lgr->llc_event_q_lock); + smc_llc_lgr_init(lgr, smc); + link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); @@ -614,7 +614,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i]); } - smc_llc_event_flush(lgr); + smc_llc_lgr_clear(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 379ced490c49..b5781511063d 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -238,6 +238,8 @@ struct smc_link_group { /* protects llc_event_q */ struct work_struct llc_event_work; /* llc event worker */ + int llc_testlink_time; + /* link keep alive time */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 265889c8b03b..e715dd6735ee 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -493,7 +493,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } /* flush the llc event queue */ -void smc_llc_event_flush(struct smc_link_group *lgr) +static void smc_llc_event_flush(struct smc_link_group *lgr) { struct smc_llc_qentry *qentry, *q; @@ -669,6 +669,23 @@ out: schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) +{ + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; +} + +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) +{ + smc_llc_event_flush(lgr); + cancel_work_sync(&lgr->llc_event_work); +} + int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_confirm); @@ -679,16 +696,15 @@ int smc_llc_link_init(struct smc_link *link) init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); - INIT_WORK(&link->lgr->llc_event_work, smc_llc_event_work); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +void smc_llc_link_active(struct smc_link *link) { link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 9de83495ad14..66063f22166b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -53,15 +53,16 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp, bool orderly); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, 
struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -void smc_llc_event_flush(struct smc_link_group *lgr); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From 64d85290d79c0677edb5a8ee2295b36c022fa5df Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:52 +0200 Subject: bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH White-list map lookup for SOCKMAP/SOCKHASH from BPF. Lookup returns a pointer to a full socket and acquires a reference if necessary. To support it we need to extend the verifier to know that: (1) register storing the lookup result holds a pointer to socket, if lookup was done on SOCKMAP/SOCKHASH, and that (2) map lookup on SOCKMAP/SOCKHASH is a reference acquiring operation, which needs a corresponding reference release with bpf_sk_release. On sock_map side, lookup handlers exposed via bpf_map_ops now bump sk_refcnt if socket is reference counted. In turn, bpf_sk_select_reuseport, the only in-kernel user of SOCKMAP/SOCKHASH ops->map_lookup_elem, was updated to release the reference. Sockets fetched from a map can be used in the same way as ones returned by BPF socket lookup helpers, such as bpf_sk_lookup_tcp. In particular, they can be used with bpf_sk_assign to direct packets toward a socket on TC ingress path. Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-2-jakub@cloudflare.com --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 4 ++++ net/core/sock_map.c | 18 ++++++++++++++++-- 3 files changed, 55 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b337e32aa94..70ad009577f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -429,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? 
map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -3934,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3942,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4112,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4623,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -6532,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } diff --git a/net/core/filter.c b/net/core/filter.c index da3b7a72c37c..70b32723e6be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8712,6 +8712,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b08dfae10f88..00a26cf2cfe9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return __sock_map_lookup_elem(map, *(u32 *)key); + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) @@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) static void *sock_hash_lookup(struct bpf_map *map, void *key) { - return __sock_hash_lookup_elem(map, key); + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void sock_hash_release_progs(struct bpf_map *map) -- cgit v1.2.3 From 99b2292ba21b47bbd97d33ab20892347ad2ac351 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Thu, 30 Apr 2020 18:16:16 +0800 Subject: net: caif: Use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index a56628962852..79b6a04d8eb6 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -211,7 +211,8 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, } } -static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, + struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; -- cgit v1.2.3 From 555da9af827d95134656fa459c8f3ece04dd867a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:38 +0200 Subject: net/smc: add event-based llc_flow framework The new framework allows starting specific types of LLC control flows, protects active flows and makes it possible to wait for flows to finish before starting a new flow. This mechanism is used for the LLC control layer to model flows like 'add link' or 'delete link' which need to send/receive several LLC messages and must not be interrupted by the wrong type of messages. 'Add link' or 'Delete link' messages arriving in the middle of a flow are delayed and processed when the current flow has finished. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
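For illustration, the caller pattern the framework is designed for (a sketch; this shape is used by later patches in this series for the rkey flows):

	struct smc_llc_qentry *qentry;
	int rc;

	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);	/* waits until no conflicting flow is active */
	if (rc)
		return rc;
	/* ... send an LLC request message on a link of lgr ... */
	qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_RKEY);	/* wait for the matching response */
	/* ... evaluate qentry, then release it ... */
	if (qentry)
		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);	/* wakes waiters, runs delayed events */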
--- net/smc/smc_core.c | 2 + net/smc/smc_core.h | 24 ++++++++ net/smc/smc_llc.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 8 +++ 4 files changed, 199 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index db49f8cd5c95..4867ddcfe0c6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -263,6 +263,7 @@ static void smc_lgr_free_work(struct work_struct *work) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } + wake_up_interruptible_all(&lgr->llc_waiter); } smc_lgr_free(lgr); } @@ -696,6 +697,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } + wake_up_interruptible_all(&lgr->llc_waiter); } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b5781511063d..70399217ad6f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,20 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ @@ -238,6 +252,16 @@ struct smc_link_group { /* protects llc_event_q */ struct work_struct llc_event_work; /* llc event worker */ + wait_queue_head_t llc_waiter; + /* w4 next llc event */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ }; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e715dd6735ee..647cf1a2dfa5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -140,6 +140,154 @@ struct smc_llc_qentry { union smc_llc_msg msg; }; +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + if ((qentry->msg.raw.hdr.common.type == SMC_LLC_ADD_LINK || + qentry->msg.raw.hdr.common.type == SMC_LLC_DELETE_LINK) && + !lgr->delayed_event) { + lgr->delayed_event = qentry; + } else { + /* forget this llc request */ + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + if (qentry == lgr->delayed_event)
lgr->delayed_event = NULL; + spin_unlock_bh(&lgr->llc_flow_lock); + smc_llc_flow_qentry_set(flow, qentry); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)), + SMC_LLC_WAIT_TIME); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == &lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up_interruptible(&lgr->llc_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + + wait_event_interruptible_timeout(lgr->llc_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + if (exp_msg && flow->qentry->msg.raw.hdr.common.type != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + flow->qentry->msg.raw.hdr.common.type == + SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -547,6 +695,16 @@ static void smc_llc_event_work(struct work_struct *work) llc_event_work); struct smc_llc_qentry *qentry; + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + if (smc_link_usable(lgr->delayed_event->link)) { + smc_llc_event_handler(lgr->delayed_event); + } else { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + kfree(qentry); + } + } + again: spin_lock_bh(&lgr->llc_event_q_lock); if (!list_empty(&lgr->llc_event_q)) { @@ -676,6 +834,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_waiter); lgr->llc_testlink_time = 
net->ipv4.sysctl_tcp_keepalive_time; } @@ -683,7 +843,12 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) void smc_llc_lgr_clear(struct smc_link_group *lgr) { smc_llc_event_flush(lgr); + wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); + if (lgr->delayed_event) { + kfree(lgr->delayed_event); + lgr->delayed_event = NULL; + } } int smc_llc_link_init(struct smc_link *link) diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 66063f22166b..49e99ff00ee7 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -63,6 +63,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From a6688d919b220bd714948e03bb3caa8a66895005 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:39 +0200 Subject: net/smc: enqueue all received LLC messages Introduce smc_llc_enqueue() to enqueue LLC messages, and adapt smc_llc_rx_handler() to enqueue all received LLC messages. smc_llc_enqueue() also makes it possible to enqueue LLC messages from local code. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 647cf1a2dfa5..a146b3b43580 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -719,11 +719,14 @@ again: } /* process llc responses in tasklet context */ -static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) { + u8 llc_type = qentry->msg.raw.hdr.common.type; + union smc_llc_msg *llc = &qentry->msg; int rc = 0; - switch (llc->raw.hdr.common.type) { + switch (llc_type) { case SMC_LLC_TEST_LINK: if (link->state == SMC_LNK_ACTIVE) complete(&link->llc_testlink_resp); @@ -759,40 +762,49 @@ static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) complete(&link->llc_delete_rkey_resp); break; } + kfree(qentry); } -/* copy received msg and add it to the event queue */ -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; struct smc_link_group *lgr = link->lgr; struct smc_llc_qentry *qentry; - union smc_llc_msg *llc = buf; unsigned long flags; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - - /* process responses immediately */ - if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { - smc_llc_rx_response(link, llc); - return; - } - qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); if (!qentry) return; qentry->link = link; INIT_LIST_HEAD(&qentry->list); memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if 
(llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ spin_lock_irqsave(&lgr->llc_event_q_lock, flags); list_add_tail(&qentry->list, &lgr->llc_event_q); spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); schedule_work(&link->lgr->llc_event_work); } +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + smc_llc_enqueue(link, llc); +} + /***************************** worker, utils *********************************/ static void smc_llc_testlink_work(struct work_struct *work) -- cgit v1.2.3 From 81e6e5e70df46bb5b205e53f2b7885e49a9d4974 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:40 +0200 Subject: net/smc: introduce link group type Add a type field to the link group which reflects the current link group redundancy state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 70399217ad6f..51366a9f4980 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,14 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, @@ -246,6 +254,8 @@ struct smc_link_group { DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; + enum smc_lgr_type type; + /* redundancy state */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; -- cgit v1.2.3 From 92334cfcb3a2a102dc1b23513bbe2fca4347e2d6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:41 +0200 Subject: net/smc: add logic to evaluate CONFIRM_LINK messages to LLC layer Introduce smc_llc_eval_conf_link() to evaluate the CONFIRM_LINK message contents, implementing this logic at the LLC layer. The function will be used by af_smc.c to process the received LLC layer messages. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
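For orientation, the intended call sequence (a condensed sketch; the server and client adaptations in the following patches wire this in):

	/* wait for the CONFIRM_LINK message assigned to the local flow */
	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_LINK);
	/* evaluate it: SMC_LLC_REQ on the client, SMC_LLC_RESP on the server */
	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);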
--- net/smc/smc_llc.c | 11 +++++++++++ net/smc/smc_llc.h | 2 ++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a146b3b43580..9248b90fe37e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -946,6 +946,17 @@ out: return rc; } +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 49e99ff00ee7..637acf91ffb7 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -66,6 +66,8 @@ int smc_llc_do_delete_rkey(struct smc_link *link, int smc_llc_flow_initiate(struct smc_link_group *lgr, enum smc_llc_flowtype type); void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3 From 4667bb4aaabf87d6b97be1b4671b9db340a58cdc Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:42 +0200 Subject: net/smc: adapt SMC server code to use the LLC flow Change the code that processes the SMC server part of connection establishment to use the LLC flow framework (CONFIRM_LINK response messages). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 39 +++++++++++++++------------------------ net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 20 +++++--------------- 3 files changed, 20 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e859e3f420d9..ab3aef1ddfa4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1019,9 +1019,11 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - int rest; + struct smc_llc_qentry *qentry; int rc; + link->lgr->type = SMC_LGR_SINGLE; + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1031,40 +1033,27 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ?
SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; - } + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); + /* initial contact - try to establish second link */ + /* tbd: call smc_llc_srv_add_link(link); */ return 0; } @@ -1240,7 +1229,9 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto decline; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 51366a9f4980..01a9cb885ef2 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -121,11 +121,8 @@ struct smc_link { enum smc_link_state state; /* state of link */ struct completion llc_confirm; /* wait for rx of conf link */ - struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ - int llc_confirm_resp_rc; /* rc from conf_resp msg */ struct completion llc_add; /* wait for rx of add link */ - struct completion llc_add_resp; /* wait for rx of add link rsp*/ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9248b90fe37e..5381b16fd482 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -724,26 +724,18 @@ static void smc_llc_rx_response(struct smc_link *link, { u8 llc_type = qentry->msg.raw.hdr.common.type; union smc_llc_msg *llc = &qentry->msg; - int rc = 0; switch (llc_type) { case SMC_LLC_TEST_LINK: if (link->state == SMC_LNK_ACTIVE) complete(&link->llc_testlink_resp); break; - case SMC_LLC_CONFIRM_LINK: - if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) - rc = ENOTSUPP; - if (link->lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = rc; - complete(&link->llc_confirm_resp); - } - break; case SMC_LLC_ADD_LINK: - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - break; + case SMC_LLC_CONFIRM_LINK: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&link->lgr->llc_waiter); + return; case SMC_LLC_DELETE_LINK: if (link->lgr->role == SMC_SERV) smc_lgr_schedule_free_work_fast(link->lgr); @@ -866,9 +858,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_confirm); - init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); - init_completion(&link->llc_add_resp); 
init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); -- cgit v1.2.3 From 0fb0b02bd6fd26cba38002be4a6bbcae2228fd44 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:43 +0200 Subject: net/smc: adapt SMC client code to use the LLC flow Change the code that processes the SMC client part of connection establishment to use the LLC flow framework (CONFIRM_LINK request messages). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 69 +++++++++++++++++++++++++++++++++------------------ net/smc/smc_clc.h | 1 + net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 72 +++++++++++++++++++----------------------------------- 4 files changed, 71 insertions(+), 74 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ab3aef1ddfa4..bd9662d06896 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -382,22 +382,24 @@ static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; - int rest; + struct smc_llc_qentry *qentry; int rc; + link->lgr->type = SMC_LGR_SINGLE; + /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -409,31 +411,30 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_AL : rc; + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } - - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - smc_llc_link_active(link); - + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + /* tbd: call smc_llc_cli_add_link(link, qentry); */ return 0; } @@ -613,8 +614,8 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; ini->is_smcd = false; ini->ib_lcl = &aclc->lcl; @@ -627,10 +628,28 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->qpn)) { + link = l; + break; + } + } + if (!link) + return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK, + ini->cln_first_contact); + smc->conn.lnk = link; + } + /* create send buffer and rmb */ if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, @@ -666,7 +685,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 4f2e150a2be1..465876701b75 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -45,6 +45,7 @@ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 01a9cb885ef2..31237c4c0d93 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -120,9 +120,6 @@ struct smc_link { struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct completion llc_confirm; /* wait for rx of conf link */ - int llc_confirm_rc; /* rc from confirm link msg */ - struct completion llc_add; /* wait for rx of add link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5381b16fd482..644e9ab0dec5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -528,47 +528,6 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ -static void 
smc_llc_rx_confirm_link(struct smc_link *link, - struct smc_llc_msg_confirm_link *llc) -{ - struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc = 0; - - /* RMBE eyecatchers are not supported */ - if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) - conf_rc = ENOTSUPP; - - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } -} - -static void smc_llc_rx_add_link(struct smc_link *link, - struct smc_llc_msg_add_link *llc) -{ - struct smc_link_group *lgr = smc_get_lgr(link); - - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } - - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc); -} - static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { @@ -657,6 +616,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) { union smc_llc_msg *llc = &qentry->msg; struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; if (!smc_link_usable(link)) goto out; @@ -665,11 +625,31 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) case SMC_LLC_TEST_LINK: smc_llc_rx_test_link(link, &llc->test_link); break; - case SMC_LLC_CONFIRM_LINK: - smc_llc_rx_confirm_link(link, &llc->confirm_link); - break; case SMC_LLC_ADD_LINK: - smc_llc_rx_add_link(link, &llc->add_link); + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + /* tbd: schedule_work(&lgr->llc_add_link_work); */ + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + /* tbd: schedule_work(&lgr->llc_add_link_work); */ + } + return; + case SMC_LLC_CONFIRM_LINK: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&lgr->llc_waiter); + return; + } break; case SMC_LLC_DELETE_LINK: smc_llc_rx_delete_link(link, &llc->delete_link); break; @@ -857,8 +837,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_confirm); - init_completion(&link->llc_add); init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); -- cgit v1.2.3 From 3d88a21b0cb6a2661a567e57a431e5aa12ecb203 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:44 +0200 Subject: net/smc: multiple link support and LLC flow for smc_llc_do_confirm_rkey Adapt smc_llc_do_confirm_rkey() to use the LLC flow and support the rkeys of multiple links when the CONFIRM_RKEY LLC message is built. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
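For illustration, the rtoken layout this change produces in the CONFIRM_RKEY message for a link group with two active links (a sketch following the patch's field names; the rkey and vaddr values are placeholders):

	/* rtoken[0] describes the rmb on the sending link and carries the
	 * count of additional entries
	 */
	rkeyllc->rtoken[0].num_rkeys = 1;
	rkeyllc->rtoken[0].rmb_key   = htonl(send_link_rkey);		/* placeholder */
	rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(send_link_vaddr);	/* placeholder */
	/* rtoken[1..n] describe the same rmb on the other active links,
	 * tagged with the respective link_id
	 */
	rkeyllc->rtoken[1].link_id   = other_link_id;			/* placeholder */
	rkeyllc->rtoken[1].rmb_key   = htonl(other_link_rkey);		/* placeholder */
	rkeyllc->rtoken[1].rmb_vaddr = cpu_to_be64(other_link_vaddr);	/* placeholder */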
--- net/smc/smc_core.h | 2 -- net/smc/smc_llc.c | 65 +++++++++++++++++++++++++++++++++++------------------- net/smc/smc_llc.h | 2 +- 3 files changed, 43 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 31237c4c0d93..4e0dfb1d5804 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,8 +123,6 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ - int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 644e9ab0dec5..5db11f54b4cd 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -369,27 +369,44 @@ int smc_llc_send_confirm_link(struct smc_link *link, } /* send LLC confirm rkey request */ -static int smc_llc_send_confirm_rkey(struct smc_link *link, +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; - int rc; + struct smc_link *link; + int i, rc, rtok_ix; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) return rc; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if (link->state == SMC_LNK_ACTIVE && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address( + rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(send_link, pend); return rc; } @@ -712,6 +729,7 @@ static void smc_llc_rx_response(struct smc_link *link, break; case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_CONFIRM_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); @@ -720,11 +738,6 @@ static void smc_llc_rx_response(struct smc_link *link, if (link->lgr->role == SMC_SERV) smc_lgr_schedule_free_work_fast(link->lgr); break; - case SMC_LLC_CONFIRM_RKEY: - link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey_resp); - break; case SMC_LLC_CONFIRM_RKEY_CONT: /* unused as long as we don't send this type of msg */ break; @@ -837,7 +850,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) {
- init_completion(&link->llc_confirm_rkey_resp); init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); @@ -870,23 +882,30 @@ void smc_llc_link_clear(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* register a new rtoken at the remote peer */ -int smc_llc_do_confirm_rkey(struct smc_link *link, +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { - int rc; + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; - /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey_resp); - rc = smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); + if (rc) + goto out; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout( - &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_resp_rc) - return -EFAULT; - return 0; + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } /* unregister an rtoken at the remote peer */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 637acf91ffb7..d82d8346b61e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -59,7 +59,7 @@ int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); -int smc_llc_do_confirm_rkey(struct smc_link *link, +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3 From 6d74c3a8a3e7a488a7d9d8c4a59091ccae72fc4c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:45 +0200 Subject: net/smc: multiple link support and LLC flow for smc_llc_do_delete_rkey Adapt smc_llc_do_delete_rkey() to use the LLC flow and support multiple links when deleting the rkeys for rmb buffers at the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 10 ++++------ net/smc/smc_core.h | 3 --- net/smc/smc_llc.c | 39 +++++++++++++++++++-------------------- net/smc/smc_llc.h | 2 +- 4 files changed, 24 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4867ddcfe0c6..f71a366ed6ac 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -446,13 +446,11 @@ out: } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, - struct smc_link *lnk) + struct smc_link_group *lgr) { - struct smc_link_group *lgr = lnk->lgr; - if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ - smc_llc_do_delete_rkey(lnk, rmb_desc); + smc_llc_do_delete_rkey(lgr, rmb_desc); rmb_desc->is_conf_rkey = false; } if (rmb_desc->is_reg_err) { @@ -475,7 +473,7 @@ static void smc_buf_unuse(struct smc_connection *conn, if (conn->rmb_desc && lgr->is_smcd) conn->rmb_desc->used = 0; else if (conn->rmb_desc) - smcr_buf_unuse(conn->rmb_desc, conn->lnk); + smcr_buf_unuse(conn->rmb_desc, lgr); } /* remove a finished connection from its link group */ @@ -1169,7 +1167,6 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, if (!smc_link_usable(lnk)) continue; if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { - smcr_buf_unuse(buf_desc, lnk); rc = -ENOMEM; goto out; } @@ -1275,6 +1272,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, lgr); return -ENOMEM; } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4e0dfb1d5804..364a54e28d61 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,9 +123,6 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ - int llc_delete_rkey_resp_rc; /* rc from del rkey */ - struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. 
The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5db11f54b4cd..f9ec270818fa 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -720,7 +720,6 @@ static void smc_llc_rx_response(struct smc_link *link, struct smc_llc_qentry *qentry) { u8 llc_type = qentry->msg.raw.hdr.common.type; - union smc_llc_msg *llc = &qentry->msg; switch (llc_type) { case SMC_LLC_TEST_LINK: @@ -730,6 +729,7 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); @@ -741,11 +741,6 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* unused as long as we don't send this type of msg */ break; - case SMC_LLC_DELETE_RKEY: - link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey_resp); - break; } kfree(qentry); } @@ -850,8 +845,6 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) int smc_llc_link_init(struct smc_link *link) { - init_completion(&link->llc_delete_rkey_resp); - mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; @@ -909,27 +902,33 @@ out: } /* unregister an rtoken at the remote peer */ -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc) { + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; int rc = 0; - mutex_lock(&link->llc_delete_rkey_mutex); - if (link->state != SMC_LNK_ACTIVE) - goto out; - reinit_completion(&link->llc_delete_rkey_resp); - rc = smc_llc_send_delete_rkey(link, rmb_desc); + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout( - &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_resp_rc) + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) rc = -EFAULT; - else - rc = 0; out: - mutex_unlock(&link->llc_delete_rkey_mutex); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index d82d8346b61e..e9f23affece6 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -61,7 +61,7 @@ void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc); int smc_llc_flow_initiate(struct smc_link_group *lgr, enum smc_llc_flowtype type); -- cgit v1.2.3 From 56e8091c7a098ef2257f85f16665d79cf3049da9 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:46 +0200 Subject: net/smc: move the TEST_LINK response processing into event handler Get rid of the extra 
function and move the two-liner for the TEST_LINK response processing into the event handler function. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f9ec270818fa..4945abbad111 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -563,13 +563,6 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate_sched(lgr); } -static void smc_llc_rx_test_link(struct smc_link *link, - struct smc_llc_msg_test_link *llc) -{ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - static void smc_llc_rx_confirm_rkey(struct smc_link *link, struct smc_llc_msg_confirm_rkey *llc) { @@ -640,7 +633,8 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: - smc_llc_rx_test_link(link, &llc->test_link); + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); break; case SMC_LLC_ADD_LINK: if (list_empty(&lgr->list)) -- cgit v1.2.3 From ba21abd22f9ffa023921923a6c01d28b59731ff8 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:47 +0200 Subject: net/smc: new smc_rtoken_set functions for multiple link support Introduce smc_rtoken_set() to set the rtoken for a new link to an existing rmb whose rtoken is given, and smc_rtoken_set2() to set an rtoken for a new link whose link_id is given. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 4 ++++ 2 files changed, 51 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f71a366ed6ac..096dce92ee2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1368,6 +1368,53 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 364a54e28d61..0be26386057f 100644 --- 
a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -352,6 +352,10 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); -- cgit v1.2.3 From 3bc67e098c3e215f6e09ba3c0e1f569e7ae020d0 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:48 +0200 Subject: net/smc: adapt SMC remote CONFIRM_RKEY processing to use the LLC flow Use the LLC flow framework for the processing of CONFIRM_RKEY messages that were received from the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 56 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4945abbad111..b7b5cc01b78e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -105,6 +105,7 @@ struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ }; #define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 struct smc_llc_msg_delete_rkey { /* type 0x09 */ @@ -563,21 +564,41 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate_sched(lgr); } -static void smc_llc_rx_confirm_rkey(struct smc_link *link, - struct smc_llc_msg_confirm_rkey *llc) +/* process a confirm_rkey request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) { - int rc; - - rc = smc_rtoken_add(link, - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); - - /* ignore rtokens for other links, we have only one link */ - + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + + for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc); + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, @@ -666,8 +687,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_rx_delete_link(link, &llc->delete_link); break; case SMC_LLC_CONFIRM_RKEY: - smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); - break; + /* new request 
from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; case SMC_LLC_CONFIRM_RKEY_CONT: smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); break; -- cgit v1.2.3 From 218b24fe381238941a06496eaf221a22c5935267 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:49 +0200 Subject: net/smc: adapt SMC remote DELETE_RKEY processing to use the LLC flow Use the LLC flow framework for the processing of DELETE_RKEY messages that were received from the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index b7b5cc01b78e..e458207bde9e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -601,31 +601,37 @@ out: smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) -{ - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - -static void smc_llc_rx_delete_rkey(struct smc_link *link, - struct smc_llc_msg_delete_rkey *llc) +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) { + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; u8 err_mask = 0; int i, max; + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { if (smc_rtoken_delete(link, llc->rkey[i])) err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } - if (err_mask) { llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; llc->err_mask = err_mask; } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} +static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, + struct smc_llc_msg_confirm_rkey_cont *llc) +{ + /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, llc); } @@ -698,8 +704,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); break; case SMC_LLC_DELETE_RKEY: - smc_llc_rx_delete_rkey(link, &llc->delete_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; } out: kfree(qentry); -- cgit v1.2.3 From 42d18acce9e29b61f5dbfc5118d7c72093e703a1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:50 +0200 Subject: net/smc: remove handling of CONFIRM_RKEY_CONTINUE The new SMC-R multiple link support will support a maximum of 3 links, and one CONFIRM_RKEY LLC message can transport 3 rkeys of an rmb buffer. There is no need for the LLC message type CONFIRM_RKEY_CONTINUE which is needed when more than 3 rkeys per rmb buffer needs to be exchanged. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e458207bde9e..92c9a8a8aaf9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -98,12 +98,6 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */ u8 reserved; }; -struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ - struct smc_llc_hdr hd; - u8 num_rkeys; - struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; -}; - #define SMC_LLC_DEL_RKEY_MAX 8 #define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 @@ -123,7 +117,6 @@ union smc_llc_msg { struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; - struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; @@ -628,14 +621,6 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) -{ - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc); -} - /* flush the llc event queue */ static void smc_llc_event_flush(struct smc_link_group *lgr) { @@ -701,7 +686,9 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } return; case SMC_LLC_CONFIRM_RKEY_CONT: - smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ break; case SMC_LLC_DELETE_RKEY: /* new request from remote, assign to remote flow */ @@ -770,7 +757,7 @@ static void smc_llc_rx_response(struct smc_link *link, smc_lgr_schedule_free_work_fast(link->lgr); break; case SMC_LLC_CONFIRM_RKEY_CONT: - /* unused as long as we don't send this type of msg */ + /* not used because max links is 3 */ break; } kfree(qentry); -- cgit v1.2.3 From 41a211d862242439c9cdb2481946bb0928760541 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 30 Apr 2020 15:55:51 +0200 Subject: net/smc: remove obsolete link state DELETING The connection layer in af_smc.c is now using the new LLC flow framework, which made the link state DELETING obsolete. Remove the state and the respective helpers. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 4 +--- net/smc/smc_core.h | 1 - net/smc/smc_llc.c | 7 ------- net/smc/smc_llc.h | 1 - 4 files changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 096dce92ee2b..3539ceef9a97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -200,7 +200,6 @@ static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { - smc_llc_link_deleting(lnk); return 0; } return -ENOTCONN; @@ -767,8 +766,7 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) continue; /* tbd - terminate only when no more links are active */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_usable(&lgr->lnk[i]) || - lgr->lnk[i].state == SMC_LNK_DELETING) + if (!smc_link_usable(&lgr->lnk[i])) continue; if (lgr->lnk[i].smcibdev == smcibdev && lgr->lnk[i].ibport == ibport) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 0be26386057f..f12474cc666c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -36,7 +36,6 @@ enum smc_link_state { /* possible states of a link */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ - SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 92c9a8a8aaf9..327cf30b98cc 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -545,7 +545,6 @@ static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_link_group *lgr = smc_get_lgr(link); smc_lgr_forget(lgr); - smc_llc_link_deleting(link); if (lgr->role == SMC_SERV) { /* client asks to delete this link, send request */ smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); @@ -878,12 +877,6 @@ void smc_llc_link_active(struct smc_link *link) } } -void smc_llc_link_deleting(struct smc_link *link) -{ - link->state = SMC_LNK_DELETING; - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index e9f23affece6..48029a5e14c3 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -57,7 +57,6 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); -void smc_llc_link_deleting(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -- cgit v1.2.3 From 6e3a401fc8af01828bcdc92d713195d318b36e7e Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:14 +0300 Subject: inet_diag: add cgroup id attribute This patch adds the cgroup v2 ID to the common inet diag message attributes. The cgroup v2 ID is the kernfs ID (ino or ino+gen). This attribute allows filtering inet diag output by the cgroup ID obtained via the name_to_handle_at() syscall, as sketched below. When the net_cls or net_prio cgroup is activated, this ID is equal to 1 (the root cgroup ID) for newly created sockets.
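[Editor's illustration, not part of the patch: a minimal userspace sketch of obtaining the cgroup v2 ID with name_to_handle_at(), as the commit message suggests. It assumes the 64-bit ID occupies the first eight bytes of f_handle, which matches current kernfs behaviour but is not a documented ABI, and the cgroup2 mount path is illustrative.]

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return the cgroup v2 ID for a cgroupfs path, or 0 on error. */
    static uint64_t get_cgroup_id(const char *path)
    {
    	struct file_handle *fh;
    	uint64_t id = 0;
    	int mount_id;

    	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
    	if (!fh)
    		return 0;
    	fh->handle_bytes = MAX_HANDLE_SZ;
    	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) == 0)
    		memcpy(&id, fh->f_handle, sizeof(id)); /* assumption: ID is the first 8 bytes */
    	free(fh);
    	return id;
    }

    int main(void)
    {
    	/* illustrative cgroup2 mount point */
    	printf("cgroup id: %llu\n",
    	       (unsigned long long)get_cgroup_id("/sys/fs/cgroup/unified"));
    	return 0;
    }

The resulting value can then be compared against the INET_DIAG_CGROUP_ID attribute that this patch adds to the dump.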
Some notes about this ID: 1) gets initialized in socket() syscall 2) incoming socket gets ID from listening socket (not during accept() syscall) 3) not changed when the process gets moved to another cgroup 4) can point to a deleted cgroup (refcounting) v2: - use CONFIG_SOCK_CGROUP_DATA instead of CONFIG_CGROUPS v3: - fix attr size by using nla_total_size_64bit() (Eric Dumazet) - more detailed commit message (Konstantin Khlebnikov) Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 6 +++++- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ce9ed1c0602f..0ef2d800fda7 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -71,7 +71,11 @@ static inline size_t inet_diag_msg_attrs_size(void) + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ #endif + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4); /* INET_DIAG_CLASS_ID */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ +#ifdef CONFIG_SOCK_CGROUP_DATA + + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ +#endif + ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 57cc429a9177..c9b1e551792c 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -157,6 +157,7 @@ enum { INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, + INET_DIAG_CGROUP_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5d50aad3cdbf..9c4c315cbc10 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -162,6 +162,13 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; } +#ifdef CONFIG_SOCK_CGROUP_DATA + if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), + INET_DIAG_PAD)) + goto errout; +#endif + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); -- cgit v1.2.3 From b1f3e43dbfacfcd95296b0f80f84b186add9ef54 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:15 +0300 Subject: inet_diag: add support for cgroup filter This patch adds the ability to filter sockets based on the cgroup v2 ID. Such a filter is helpful in the ss utility for filtering sockets by cgroup pathname. Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S.
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c9b1e551792c..e6f183ee8417 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -96,6 +96,7 @@ enum { INET_DIAG_BC_MARK_COND, INET_DIAG_BC_S_EQ, INET_DIAG_BC_D_EQ, + INET_DIAG_BC_CGROUP_COND, /* u64 cgroup v2 ID */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9c4c315cbc10..0034092358c3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -43,6 +43,9 @@ struct inet_diag_entry { u16 userlocks; u32 ifindex; u32 mark; +#ifdef CONFIG_SOCK_CGROUP_DATA + u64 cgroup_id; +#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -682,6 +685,16 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: { + u64 cgroup_id; + + cgroup_id = get_unaligned((const u64 *)(op + 1)); + if (cgroup_id != entry->cgroup_id) + yes = 0; + break; + } +#endif } if (yes) { @@ -732,6 +745,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else entry.mark = 0; +#ifdef CONFIG_SOCK_CGROUP_DATA + entry.cgroup_id = cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)); +#endif return inet_diag_bc_run(bc, &entry); } @@ -821,6 +837,15 @@ static bool valid_markcond(const struct inet_diag_bc_op *op, int len, return len >= *min_len; } +#ifdef CONFIG_SOCK_CGROUP_DATA +static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + *min_len += sizeof(u64); + return len >= *min_len; +} +#endif + static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { @@ -863,6 +888,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: + if (!valid_cgroupcond(bc, len, &min_len)) + return -EINVAL; + break; +#endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: -- cgit v1.2.3 From 40e79150c1686263e6a031d7702aec63aff31332 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:57 +0200 Subject: docs: networking: convert lapb-module.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/lapb-module.rst | 305 +++++++++++++++++++++++++++++++ Documentation/networking/lapb-module.txt | 263 -------------------------- MAINTAINERS | 2 +- net/lapb/Kconfig | 2 +- 5 files changed, 308 insertions(+), 265 deletions(-) create mode 100644 Documentation/networking/lapb-module.rst delete mode 100644 Documentation/networking/lapb-module.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 0c5d7a037983..acd2567cf0d4 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -75,6 +75,7 @@ Contents: ipvs-sysctl kcm l2tp + lapb-module .. 
only:: subproject and html diff --git a/Documentation/networking/lapb-module.rst b/Documentation/networking/lapb-module.rst new file mode 100644 index 000000000000..ff586bc9f005 --- /dev/null +++ b/Documentation/networking/lapb-module.rst @@ -0,0 +1,305 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +The Linux LAPB Module Interface +=============================== + +Version 1.3 + +Jonathan Naylor 29.12.96 + +Changed (Henner Eisen, 2000-10-29): int return value for data_indication() + +The LAPB module will be a separately compiled module for use by any parts of +the Linux operating system that require a LAPB service. This document +defines the interfaces to, and the services provided by this module. The +term module in this context does not imply that the LAPB module is a +separately loadable module, although it may be. The term module is used in +its more standard meaning. + +The interface to the LAPB module consists of functions to the module, +callbacks from the module to indicate important state changes, and +structures for getting and setting information about the module. + +Structures +---------- + +Probably the most important structure is the skbuff structure for holding +received and transmitted data, however it is beyond the scope of this +document. + +The two LAPB specific structures are the LAPB initialisation structure and +the LAPB parameter structure. These will be defined in a standard header +file, . The header file is internal to the LAPB +module and is not for use. + +LAPB Initialisation Structure +----------------------------- + +This structure is used only once, in the call to lapb_register (see below). +It contains information about the device driver that requires the services +of the LAPB module:: + + struct lapb_register_struct { + void (*connect_confirmation)(int token, int reason); + void (*connect_indication)(int token, int reason); + void (*disconnect_confirmation)(int token, int reason); + void (*disconnect_indication)(int token, int reason); + int (*data_indication)(int token, struct sk_buff *skb); + void (*data_transmit)(int token, struct sk_buff *skb); + }; + +Each member of this structure corresponds to a function in the device driver +that is called when a particular event in the LAPB module occurs. These will +be described in detail below. If a callback is not required (!!) then a NULL +may be substituted. + + +LAPB Parameter Structure +------------------------ + +This structure is used with the lapb_getparms and lapb_setparms functions +(see below). They are used to allow the device driver to get and set the +operational parameters of the LAPB implementation for a given connection:: + + struct lapb_parms_struct { + unsigned int t1; + unsigned int t1timer; + unsigned int t2; + unsigned int t2timer; + unsigned int n2; + unsigned int n2count; + unsigned int window; + unsigned int state; + unsigned int mode; + }; + +T1 and T2 are protocol timing parameters and are given in units of 100ms. N2 +is the maximum number of tries on the link before it is declared a failure. +The window size is the maximum number of outstanding data packets allowed to +be unacknowledged by the remote end, the value of the window is between 1 +and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB +link. + +The mode variable is a bit field used for setting (at present) three values. 
+The bit fields have the following meanings: + +====== ================================================= +Bit Meaning +====== ================================================= +0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED). +1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP). +2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE) +3-31 Reserved, must be 0. +====== ================================================= + +Extended LAPB operation indicates the use of extended sequence numbers and +consequently larger window sizes, the default is standard LAPB operation. +MLP operation is the same as SLP operation except that the addresses used by +LAPB are different to indicate the mode of operation, the default is Single +Link Procedure. The difference between DCE and DTE operation is (i) the +addresses used for commands and responses, and (ii) when the DCE is not +connected, it sends DM without polls set, every T1. The upper case constant +names will be defined in the public LAPB header file. + + +Functions +--------- + +The LAPB module provides a number of function entry points. + +:: + + int lapb_register(void *token, struct lapb_register_struct); + +This must be called before the LAPB module may be used. If the call is +successful then LAPB_OK is returned. The token must be a unique identifier +generated by the device driver to allow for the unique identification of the +instance of the LAPB link. It is returned by the LAPB module in all of the +callbacks, and is used by the device driver in all calls to the LAPB module. +For multiple LAPB links in a single device driver, multiple calls to +lapb_register must be made. The format of the lapb_register_struct is given +above. The return values are: + +============= ============================= +LAPB_OK LAPB registered successfully. +LAPB_BADTOKEN Token is already registered. +LAPB_NOMEM Out of memory +============= ============================= + +:: + + int lapb_unregister(void *token); + +This releases all the resources associated with a LAPB link. Any current +LAPB link will be abandoned without further messages being passed. After +this call, the value of token is no longer valid for any calls to the LAPB +function. The valid return values are: + +============= =============================== +LAPB_OK LAPB unregistered successfully. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= =============================== + +:: + + int lapb_getparms(void *token, struct lapb_parms_struct *parms); + +This allows the device driver to get the values of the current LAPB +variables, the lapb_parms_struct is described above. The valid return values +are: + +============= ============================= +LAPB_OK LAPB getparms was successful. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= ============================= + +:: + + int lapb_setparms(void *token, struct lapb_parms_struct *parms); + +This allows the device driver to set the values of the current LAPB +variables, the lapb_parms_struct is described above. The values of t1timer, +t2timer and n2count are ignored, likewise changing the mode bits when +connected will be ignored. An error implies that none of the values have +been changed. The valid return values are: + +============= ================================================= +LAPB_OK LAPB getparms was successful. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_INVALUE One of the values was out of its allowable range. 
+============= ================================================= + +:: + + int lapb_connect_request(void *token); + +Initiate a connect using the current parameter settings. The valid return +values are: + +============== ================================= +LAPB_OK LAPB is starting to connect. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_CONNECTED LAPB module is already connected. +============== ================================= + +:: + + int lapb_disconnect_request(void *token); + +Initiate a disconnect. The valid return values are: + +================= =============================== +LAPB_OK LAPB is starting to disconnect. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_NOTCONNECTED LAPB module is not connected. +================= =============================== + +:: + + int lapb_data_request(void *token, struct sk_buff *skb); + +Queue data with the LAPB module for transmitting over the link. If the call +is successful then the skbuff is owned by the LAPB module and may not be +used by the device driver again. The valid return values are: + +================= ============================= +LAPB_OK LAPB has accepted the data. +LAPB_BADTOKEN Invalid/unknown LAPB token. +LAPB_NOTCONNECTED LAPB module is not connected. +================= ============================= + +:: + + int lapb_data_received(void *token, struct sk_buff *skb); + +Queue data with the LAPB module which has been received from the device. It +is expected that the data passed to the LAPB module has skb->data pointing +to the beginning of the LAPB data. If the call is successful then the skbuff +is owned by the LAPB module and may not be used by the device driver again. +The valid return values are: + +============= =========================== +LAPB_OK LAPB has accepted the data. +LAPB_BADTOKEN Invalid/unknown LAPB token. +============= =========================== + +Callbacks +--------- + +These callbacks are functions provided by the device driver for the LAPB +module to call when an event occurs. They are registered with the LAPB +module with lapb_register (see above) in the structure lapb_register_struct +(see above). + +:: + + void (*connect_confirmation)(void *token, int reason); + +This is called by the LAPB module when a connection is established after +being requested by a call to lapb_connect_request (see above). The reason is +always LAPB_OK. + +:: + + void (*connect_indication)(void *token, int reason); + +This is called by the LAPB module when the link is established by the remote +system. The value of reason is always LAPB_OK. + +:: + + void (*disconnect_confirmation)(void *token, int reason); + +This is called by the LAPB module when an event occurs after the device +driver has called lapb_disconnect_request (see above). The reason indicates +what has happened. In all cases the LAPB link can be regarded as being +terminated. The values for reason are: + +================= ==================================================== +LAPB_OK The LAPB link was terminated normally. +LAPB_NOTCONNECTED The remote system was not connected. +LAPB_TIMEDOUT No response was received in N2 tries from the remote + system. +================= ==================================================== + +:: + + void (*disconnect_indication)(void *token, int reason); + +This is called by the LAPB module when the link is terminated by the remote +system or another event has occurred to terminate the link. This may be +returned in response to a lapb_connect_request (see above) if the remote +system refused the request. 
The values for reason are: + +================= ==================================================== +LAPB_OK The LAPB link was terminated normally by the remote + system. +LAPB_REFUSED The remote system refused the connect request. +LAPB_NOTCONNECTED The remote system was not connected. +LAPB_TIMEDOUT No response was received in N2 tries from the remote + system. +================= ==================================================== + +:: + + int (*data_indication)(void *token, struct sk_buff *skb); + +This is called by the LAPB module when data has been received from the +remote system that should be passed onto the next layer in the protocol +stack. The skbuff becomes the property of the device driver and the LAPB +module will not perform any more actions on it. The skb->data pointer will +be pointing to the first byte of data after the LAPB header. + +This method should return NET_RX_DROP (as defined in the header +file include/linux/netdevice.h) if and only if the frame was dropped +before it could be delivered to the upper layer. + +:: + + void (*data_transmit)(void *token, struct sk_buff *skb); + +This is called by the LAPB module when data is to be transmitted to the +remote system by the device driver. The skbuff becomes the property of the +device driver and the LAPB module will not perform any more actions on it. +The skb->data pointer will be pointing to the first byte of the LAPB header. diff --git a/Documentation/networking/lapb-module.txt b/Documentation/networking/lapb-module.txt deleted file mode 100644 index d4fc8f221559..000000000000 --- a/Documentation/networking/lapb-module.txt +++ /dev/null @@ -1,263 +0,0 @@ - The Linux LAPB Module Interface 1.3 - - Jonathan Naylor 29.12.96 - -Changed (Henner Eisen, 2000-10-29): int return value for data_indication() - -The LAPB module will be a separately compiled module for use by any parts of -the Linux operating system that require a LAPB service. This document -defines the interfaces to, and the services provided by this module. The -term module in this context does not imply that the LAPB module is a -separately loadable module, although it may be. The term module is used in -its more standard meaning. - -The interface to the LAPB module consists of functions to the module, -callbacks from the module to indicate important state changes, and -structures for getting and setting information about the module. - -Structures ----------- - -Probably the most important structure is the skbuff structure for holding -received and transmitted data, however it is beyond the scope of this -document. - -The two LAPB specific structures are the LAPB initialisation structure and -the LAPB parameter structure. These will be defined in a standard header -file, . The header file is internal to the LAPB -module and is not for use. - -LAPB Initialisation Structure ------------------------------ - -This structure is used only once, in the call to lapb_register (see below). -It contains information about the device driver that requires the services -of the LAPB module. 
- -struct lapb_register_struct { - void (*connect_confirmation)(int token, int reason); - void (*connect_indication)(int token, int reason); - void (*disconnect_confirmation)(int token, int reason); - void (*disconnect_indication)(int token, int reason); - int (*data_indication)(int token, struct sk_buff *skb); - void (*data_transmit)(int token, struct sk_buff *skb); -}; - -Each member of this structure corresponds to a function in the device driver -that is called when a particular event in the LAPB module occurs. These will -be described in detail below. If a callback is not required (!!) then a NULL -may be substituted. - - -LAPB Parameter Structure ------------------------- - -This structure is used with the lapb_getparms and lapb_setparms functions -(see below). They are used to allow the device driver to get and set the -operational parameters of the LAPB implementation for a given connection. - -struct lapb_parms_struct { - unsigned int t1; - unsigned int t1timer; - unsigned int t2; - unsigned int t2timer; - unsigned int n2; - unsigned int n2count; - unsigned int window; - unsigned int state; - unsigned int mode; -}; - -T1 and T2 are protocol timing parameters and are given in units of 100ms. N2 -is the maximum number of tries on the link before it is declared a failure. -The window size is the maximum number of outstanding data packets allowed to -be unacknowledged by the remote end, the value of the window is between 1 -and 7 for a standard LAPB link, and between 1 and 127 for an extended LAPB -link. - -The mode variable is a bit field used for setting (at present) three values. -The bit fields have the following meanings: - -Bit Meaning -0 LAPB operation (0=LAPB_STANDARD 1=LAPB_EXTENDED). -1 [SM]LP operation (0=LAPB_SLP 1=LAPB=MLP). -2 DTE/DCE operation (0=LAPB_DTE 1=LAPB_DCE) -3-31 Reserved, must be 0. - -Extended LAPB operation indicates the use of extended sequence numbers and -consequently larger window sizes, the default is standard LAPB operation. -MLP operation is the same as SLP operation except that the addresses used by -LAPB are different to indicate the mode of operation, the default is Single -Link Procedure. The difference between DCE and DTE operation is (i) the -addresses used for commands and responses, and (ii) when the DCE is not -connected, it sends DM without polls set, every T1. The upper case constant -names will be defined in the public LAPB header file. - - -Functions ---------- - -The LAPB module provides a number of function entry points. - - -int lapb_register(void *token, struct lapb_register_struct); - -This must be called before the LAPB module may be used. If the call is -successful then LAPB_OK is returned. The token must be a unique identifier -generated by the device driver to allow for the unique identification of the -instance of the LAPB link. It is returned by the LAPB module in all of the -callbacks, and is used by the device driver in all calls to the LAPB module. -For multiple LAPB links in a single device driver, multiple calls to -lapb_register must be made. The format of the lapb_register_struct is given -above. The return values are: - -LAPB_OK LAPB registered successfully. -LAPB_BADTOKEN Token is already registered. -LAPB_NOMEM Out of memory - - -int lapb_unregister(void *token); - -This releases all the resources associated with a LAPB link. Any current -LAPB link will be abandoned without further messages being passed. After -this call, the value of token is no longer valid for any calls to the LAPB -function. 
The valid return values are: - -LAPB_OK LAPB unregistered successfully. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -int lapb_getparms(void *token, struct lapb_parms_struct *parms); - -This allows the device driver to get the values of the current LAPB -variables, the lapb_parms_struct is described above. The valid return values -are: - -LAPB_OK LAPB getparms was successful. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -int lapb_setparms(void *token, struct lapb_parms_struct *parms); - -This allows the device driver to set the values of the current LAPB -variables, the lapb_parms_struct is described above. The values of t1timer, -t2timer and n2count are ignored, likewise changing the mode bits when -connected will be ignored. An error implies that none of the values have -been changed. The valid return values are: - -LAPB_OK LAPB getparms was successful. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_INVALUE One of the values was out of its allowable range. - - -int lapb_connect_request(void *token); - -Initiate a connect using the current parameter settings. The valid return -values are: - -LAPB_OK LAPB is starting to connect. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_CONNECTED LAPB module is already connected. - - -int lapb_disconnect_request(void *token); - -Initiate a disconnect. The valid return values are: - -LAPB_OK LAPB is starting to disconnect. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_NOTCONNECTED LAPB module is not connected. - - -int lapb_data_request(void *token, struct sk_buff *skb); - -Queue data with the LAPB module for transmitting over the link. If the call -is successful then the skbuff is owned by the LAPB module and may not be -used by the device driver again. The valid return values are: - -LAPB_OK LAPB has accepted the data. -LAPB_BADTOKEN Invalid/unknown LAPB token. -LAPB_NOTCONNECTED LAPB module is not connected. - - -int lapb_data_received(void *token, struct sk_buff *skb); - -Queue data with the LAPB module which has been received from the device. It -is expected that the data passed to the LAPB module has skb->data pointing -to the beginning of the LAPB data. If the call is successful then the skbuff -is owned by the LAPB module and may not be used by the device driver again. -The valid return values are: - -LAPB_OK LAPB has accepted the data. -LAPB_BADTOKEN Invalid/unknown LAPB token. - - -Callbacks ---------- - -These callbacks are functions provided by the device driver for the LAPB -module to call when an event occurs. They are registered with the LAPB -module with lapb_register (see above) in the structure lapb_register_struct -(see above). - - -void (*connect_confirmation)(void *token, int reason); - -This is called by the LAPB module when a connection is established after -being requested by a call to lapb_connect_request (see above). The reason is -always LAPB_OK. - - -void (*connect_indication)(void *token, int reason); - -This is called by the LAPB module when the link is established by the remote -system. The value of reason is always LAPB_OK. - - -void (*disconnect_confirmation)(void *token, int reason); - -This is called by the LAPB module when an event occurs after the device -driver has called lapb_disconnect_request (see above). The reason indicates -what has happened. In all cases the LAPB link can be regarded as being -terminated. The values for reason are: - -LAPB_OK The LAPB link was terminated normally. -LAPB_NOTCONNECTED The remote system was not connected. 
-LAPB_TIMEDOUT No response was received in N2 tries from the remote - system. - - -void (*disconnect_indication)(void *token, int reason); - -This is called by the LAPB module when the link is terminated by the remote -system or another event has occurred to terminate the link. This may be -returned in response to a lapb_connect_request (see above) if the remote -system refused the request. The values for reason are: - -LAPB_OK The LAPB link was terminated normally by the remote - system. -LAPB_REFUSED The remote system refused the connect request. -LAPB_NOTCONNECTED The remote system was not connected. -LAPB_TIMEDOUT No response was received in N2 tries from the remote - system. - - -int (*data_indication)(void *token, struct sk_buff *skb); - -This is called by the LAPB module when data has been received from the -remote system that should be passed onto the next layer in the protocol -stack. The skbuff becomes the property of the device driver and the LAPB -module will not perform any more actions on it. The skb->data pointer will -be pointing to the first byte of data after the LAPB header. - -This method should return NET_RX_DROP (as defined in the header -file include/linux/netdevice.h) if and only if the frame was dropped -before it could be delivered to the upper layer. - - -void (*data_transmit)(void *token, struct sk_buff *skb); - -This is called by the LAPB module when data is to be transmitted to the -remote system by the device driver. The skbuff becomes the property of the -device driver and the LAPB module will not perform any more actions on it. -The skb->data pointer will be pointing to the first byte of the LAPB header. diff --git a/MAINTAINERS b/MAINTAINERS index 3a5f52a3c055..956999d2d979 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9515,7 +9515,7 @@ F: drivers/soc/lantiq LAPB module L: linux-x25@vger.kernel.org S: Orphan -F: Documentation/networking/lapb-module.txt +F: Documentation/networking/lapb-module.rst F: include/*/lapb.h F: net/lapb/ diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig index 6acfc999c952..5b50e8d64f26 100644 --- a/net/lapb/Kconfig +++ b/net/lapb/Kconfig @@ -15,7 +15,7 @@ config LAPB currently supports LAPB only over Ethernet connections. If you want to use LAPB connections over Ethernet, say Y here and to "LAPB over Ethernet driver" below. Read - for technical + for technical details. To compile this driver as a module, choose M here: the -- cgit v1.2.3 From 429ff87bcac75b929d9ffec8d4d24be2616f8052 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:03:59 +0200 Subject: docs: networking: convert mac80211-injection.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. 
Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mac80211-injection.rst | 106 ++++++++++++++++++++++++ Documentation/networking/mac80211-injection.txt | 97 ---------------------- MAINTAINERS | 2 +- net/mac80211/tx.c | 2 +- 5 files changed, 109 insertions(+), 99 deletions(-) create mode 100644 Documentation/networking/mac80211-injection.rst delete mode 100644 Documentation/networking/mac80211-injection.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b3608b177a8b..81c1834bfb57 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -77,6 +77,7 @@ Contents: l2tp lapb-module ltpc + mac80211-injection .. only:: subproject and html diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst new file mode 100644 index 000000000000..75d4edcae852 --- /dev/null +++ b/Documentation/networking/mac80211-injection.rst @@ -0,0 +1,106 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +How to use packet injection with mac80211 +========================================= + +mac80211 now allows arbitrary packets to be injected down any Monitor Mode +interface from userland. The packet you inject needs to be composed in the +following format:: + + [ radiotap header ] + [ ieee80211 header ] + [ payload ] + +The radiotap format is discussed in +./Documentation/networking/radiotap-headers.txt. + +Despite many radiotap parameters being currently defined, most only make sense +to appear on received packets. The following information is parsed from the +radiotap headers and used to control injection: + + * IEEE80211_RADIOTAP_FLAGS + + ========================= =========================================== + IEEE80211_RADIOTAP_F_FCS FCS will be removed and recalculated + IEEE80211_RADIOTAP_F_WEP frame will be encrypted if key available + IEEE80211_RADIOTAP_F_FRAG frame will be fragmented if longer than the + current fragmentation threshold. + ========================= =========================================== + + * IEEE80211_RADIOTAP_TX_FLAGS + + ============================= ======================================== + IEEE80211_RADIOTAP_F_TX_NOACK frame should be sent without waiting for + an ACK even if it is a unicast frame + ============================= ======================================== + + * IEEE80211_RADIOTAP_RATE + + legacy rate for the transmission (only for devices without own rate control) + + * IEEE80211_RADIOTAP_MCS + + HT rate for the transmission (only for devices without own rate control). + Also some flags are parsed + + ============================ ======================== + IEEE80211_RADIOTAP_MCS_SGI use short guard interval + IEEE80211_RADIOTAP_MCS_BW_40 send in HT40 mode + ============================ ======================== + + * IEEE80211_RADIOTAP_DATA_RETRIES + + number of retries when either IEEE80211_RADIOTAP_RATE or + IEEE80211_RADIOTAP_MCS was used + + * IEEE80211_RADIOTAP_VHT + + VHT mcs and number of streams used in the transmission (only for devices + without own rate control). Also other fields are parsed + + flags field + IEEE80211_RADIOTAP_VHT_FLAG_SGI: use short guard interval + + bandwidth field + * 1: send using 40MHz channel width + * 4: send using 80MHz channel width + * 11: send using 160MHz channel width + +The injection code can also skip all other currently defined radiotap fields +facilitating replay of captured radiotap headers directly. 
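[Editor's sketch, not part of the original patch: the injection path just described can be exercised from userspace with libpcap. This minimal C program is a hedged illustration; the monitor-mode interface name "mon0" and the frame contents are placeholders, and the byte patterns are taken from the document's own examples that follow.]

    /* Compose radiotap header + 802.11 header and inject via libpcap. */
    #include <pcap.h>
    #include <stdio.h>
    #include <string.h>

    static const unsigned char u8aRadiotap[] = {
    	0x00, 0x00,             /* radiotap version */
    	0x0b, 0x00,             /* radiotap header length */
    	0x04, 0x0c, 0x00, 0x00, /* present bitmap */
    	0x6c,                   /* rate */
    	0x0c,                   /* tx power */
    	0x01,                   /* antenna */
    };

    static const unsigned char u8aIeee80211[] = {
    	0x08, 0x01, 0x00, 0x00,
    	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    	0x13, 0x22, 0x33, 0x44, 0x55, 0x66,
    	0x13, 0x22, 0x33, 0x44, 0x55, 0x66,
    	0x10, 0x86,
    };

    int main(void)
    {
    	char errbuf[PCAP_ERRBUF_SIZE];
    	unsigned char buf[800];
    	size_t len = 0;
    	pcap_t *ppcap;
    	int r;

    	memcpy(buf + len, u8aRadiotap, sizeof(u8aRadiotap));
    	len += sizeof(u8aRadiotap);
    	memcpy(buf + len, u8aIeee80211, sizeof(u8aIeee80211));
    	len += sizeof(u8aIeee80211);
    	/* payload would follow here */

    	ppcap = pcap_open_live("mon0", 800, 1, 20, errbuf);
    	if (!ppcap) {
    		fprintf(stderr, "pcap_open_live: %s\n", errbuf);
    		return 1;
    	}
    	r = pcap_inject(ppcap, buf, len);
    	pcap_close(ppcap);
    	return r < 0;
    }

As the document itself notes, going through libpcap avoids having to bind a raw socket to the right interface by hand.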
+ +Here is an example valid radiotap header defining some parameters:: + + 0x00, 0x00, // <-- radiotap version + 0x0b, 0x00, // <- radiotap header length + 0x04, 0x0c, 0x00, 0x00, // <-- bitmap + 0x6c, // <-- rate + 0x0c, //<-- tx power + 0x01 //<-- antenna + +The ieee80211 header follows immediately afterwards, looking for example like +this:: + + 0x08, 0x01, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x10, 0x86 + +Then lastly there is the payload. + +After composing the packet contents, it is sent by send()-ing it to a logical +mac80211 interface that is in Monitor mode. Libpcap can also be used, +(which is easier than doing the work to bind the socket to the right +interface), along the following lines::: + + ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf); + ... + r = pcap_inject(ppcap, u8aSendBuffer, nLength); + +You can also find a link to a complete inject application here: + +http://wireless.kernel.org/en/users/Documentation/packetspammer + +Andy Green diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt deleted file mode 100644 index d58d78df9ca2..000000000000 --- a/Documentation/networking/mac80211-injection.txt +++ /dev/null @@ -1,97 +0,0 @@ -How to use packet injection with mac80211 -========================================= - -mac80211 now allows arbitrary packets to be injected down any Monitor Mode -interface from userland. The packet you inject needs to be composed in the -following format: - - [ radiotap header ] - [ ieee80211 header ] - [ payload ] - -The radiotap format is discussed in -./Documentation/networking/radiotap-headers.txt. - -Despite many radiotap parameters being currently defined, most only make sense -to appear on received packets. The following information is parsed from the -radiotap headers and used to control injection: - - * IEEE80211_RADIOTAP_FLAGS - - IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated - IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available - IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the - current fragmentation threshold. - - * IEEE80211_RADIOTAP_TX_FLAGS - - IEEE80211_RADIOTAP_F_TX_NOACK: frame should be sent without waiting for - an ACK even if it is a unicast frame - - * IEEE80211_RADIOTAP_RATE - - legacy rate for the transmission (only for devices without own rate control) - - * IEEE80211_RADIOTAP_MCS - - HT rate for the transmission (only for devices without own rate control). - Also some flags are parsed - - IEEE80211_RADIOTAP_MCS_SGI: use short guard interval - IEEE80211_RADIOTAP_MCS_BW_40: send in HT40 mode - - * IEEE80211_RADIOTAP_DATA_RETRIES - - number of retries when either IEEE80211_RADIOTAP_RATE or - IEEE80211_RADIOTAP_MCS was used - - * IEEE80211_RADIOTAP_VHT - - VHT mcs and number of streams used in the transmission (only for devices - without own rate control). Also other fields are parsed - - flags field - IEEE80211_RADIOTAP_VHT_FLAG_SGI: use short guard interval - - bandwidth field - 1: send using 40MHz channel width - 4: send using 80MHz channel width - 11: send using 160MHz channel width - -The injection code can also skip all other currently defined radiotap fields -facilitating replay of captured radiotap headers directly. 
- -Here is an example valid radiotap header defining some parameters - - 0x00, 0x00, // <-- radiotap version - 0x0b, 0x00, // <- radiotap header length - 0x04, 0x0c, 0x00, 0x00, // <-- bitmap - 0x6c, // <-- rate - 0x0c, //<-- tx power - 0x01 //<-- antenna - -The ieee80211 header follows immediately afterwards, looking for example like -this: - - 0x08, 0x01, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, - 0x13, 0x22, 0x33, 0x44, 0x55, 0x66, - 0x10, 0x86 - -Then lastly there is the payload. - -After composing the packet contents, it is sent by send()-ing it to a logical -mac80211 interface that is in Monitor mode. Libpcap can also be used, -(which is easier than doing the work to bind the socket to the right -interface), along the following lines: - - ppcap = pcap_open_live(szInterfaceName, 800, 1, 20, szErrbuf); -... - r = pcap_inject(ppcap, u8aSendBuffer, nLength); - -You can also find a link to a complete inject application here: - -http://wireless.kernel.org/en/users/Documentation/packetspammer - -Andy Green diff --git a/MAINTAINERS b/MAINTAINERS index 956999d2d979..33bfc9e4aead 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10079,7 +10079,7 @@ S: Maintained W: https://wireless.wiki.kernel.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git -F: Documentation/networking/mac80211-injection.txt +F: Documentation/networking/mac80211-injection.rst F: Documentation/networking/mac80211_hwsim/mac80211_hwsim.rst F: drivers/net/wireless/mac80211_hwsim.[ch] F: include/net/mac80211.h diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 82846aca86d9..9849c14694db 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2144,7 +2144,7 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, /* * Please update the file - * Documentation/networking/mac80211-injection.txt + * Documentation/networking/mac80211-injection.rst * when parsing new fields here. */ -- cgit v1.2.3 From c1e4535f24bcfeef55a7ed409a5f50548e284426 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:13 +0200 Subject: docs: networking: convert pktgen.txt to ReST - add SPDX header; - adjust title markup; - use bold markups on a few places; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/pktgen.rst | 412 ++++++++++++++++++++++++++++++++++++ Documentation/networking/pktgen.txt | 400 ---------------------------------- net/Kconfig | 2 +- net/core/pktgen.c | 2 +- samples/pktgen/README.rst | 2 +- 6 files changed, 416 insertions(+), 403 deletions(-) create mode 100644 Documentation/networking/pktgen.rst delete mode 100644 Documentation/networking/pktgen.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e460026331c6..696181a96e3c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -91,6 +91,7 @@ Contents: operstates packet_mmap phonet + pktgen .. only:: subproject and html diff --git a/Documentation/networking/pktgen.rst b/Documentation/networking/pktgen.rst new file mode 100644 index 000000000000..7afa1c9f1183 --- /dev/null +++ b/Documentation/networking/pktgen.rst @@ -0,0 +1,412 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +==================================== +HOWTO for the linux packet generator +==================================== + +Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel +or as a module. A module is preferred; modprobe pktgen if needed. Once +running, pktgen creates a thread for each CPU with affinity to that CPU. +Monitoring and controlling is done via /proc. It is easiest to select a +suitable sample script and configure that. + +On a dual CPU:: + + ps aux | grep pkt + root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] + root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] + + +For monitoring and control pktgen creates:: + + /proc/net/pktgen/pgctrl + /proc/net/pktgen/kpktgend_X + /proc/net/pktgen/ethX + + +Tuning NIC for max performance +============================== + +The default NIC settings are (likely) not tuned for pktgen's artificial +overload type of benchmarking, as this could hurt the normal use-case. + +Specifically increasing the TX ring buffer in the NIC:: + + # ethtool -G ethX tx 1024 + +A larger TX ring can improve pktgen's performance, while it can hurt +in the general case, 1) because the TX ring buffer might get larger +than the CPU's L1/L2 cache, 2) because it allows more queueing in the +NIC HW layer (which is bad for bufferbloat). + +One should hesitate to conclude that packets/descriptors in the HW +TX ring cause delay. Drivers usually delay cleaning up the +ring-buffers for various performance reasons, and packets stalling +the TX ring might just be waiting for cleanup. + +This cleanup issue is specifically the case for the driver ixgbe +(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, +and the cleanup interval is affected by the ethtool --coalesce setting +of parameter "rx-usecs". + +For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6):: + + # ethtool -C ethX rx-usecs 30 + + +Kernel threads +============== +Pktgen creates a thread for each CPU with affinity to that CPU. +Which is controlled through procfile /proc/net/pktgen/kpktgend_X. + +Example: /proc/net/pktgen/kpktgend_0:: + + Running: + Stopped: eth4@0 + Result: OK: add_device=eth4@0 + +Most important are the devices assigned to the thread. + +The two basic thread commands are: + + * add_device DEVICE@NAME -- adds a single device + * rem_device_all -- remove all associated devices + +When adding a device to a thread, a corresponding procfile is created +which is used for configuring this device. Thus, device names need to +be unique. + +To support adding the same device to multiple threads, which is useful +with multi queue NICs, the device naming scheme is extended with "@": +device@something + +The part after "@" can be anything, but it is custom to use the thread +number. + +Viewing devices +=============== + +The Params section holds configured information. The Current section +holds running statistics. The Result is printed after a run or after +interruption. 
Example:: + + /proc/net/pktgen/eth4@0 + + Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 + frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 + flows: 0 flowlen: 0 + queue_map_min: 0 queue_map_max: 0 + dst_min: 192.168.81.2 dst_max: + src_min: src_max: + src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 + udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 + src_mac_count: 0 dst_mac_count: 0 + Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU + Current: + pkts-sofar: 100000 errors: 0 + started: 623913381008us stopped: 623913396439us idle: 25us + seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 + cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 + cur_udp_dst: 9 cur_udp_src: 42 + cur_queue_map: 0 + flows: 0 + Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) + 6480562pps 3110Mb/sec (3110669760bps) errors: 0 + + +Configuring devices +=================== +This is done via the /proc interface, and most easily done via pgset +as defined in the sample scripts. +You need to specify PGDEV environment variable to use functions from sample +scripts, i.e.:: + + export PGDEV=/proc/net/pktgen/eth4@0 + source samples/pktgen/functions.sh + +Examples:: + + pg_ctrl start starts injection. + pg_ctrl stop aborts injection. Also, ^C aborts generator. + + pgset "clone_skb 1" sets the number of copies of the same packet + pgset "clone_skb 0" use single SKB for all transmits + pgset "burst 8" uses xmit_more API to queue 8 copies of the same + packet and update HW tx queue tail pointer once. + "burst 1" is the default + pgset "pkt_size 9014" sets packet size to 9014 + pgset "frags 5" packet will consist of 5 fragments + pgset "count 200000" sets number of packets to send, set to zero + for continuous sends until explicitly stopped. + + pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds + + pgset "dst 10.0.0.1" sets IP destination address + (BEWARE! This generator is very aggressive!) + + pgset "dst_min 10.0.0.1" Same as dst + pgset "dst_max 10.0.0.254" Set the maximum destination IP. + pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. + pgset "src_max 10.0.0.254" Set the maximum source IP. + pgset "dst6 fec0::1" IPV6 destination address + pgset "src6 fec0::2" IPV6 source address + pgset "dstmac 00:00:00:00:00:00" sets MAC destination address + pgset "srcmac 00:00:00:00:00:00" sets MAC source address + + pgset "queue_map_min 0" Sets the min value of tx queue interval + pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices + To select queue 1 of a given device, + use queue_map_min=1 and queue_map_max=1 + + pgset "src_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with srcmac. + + pgset "dst_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with dstmac. + + pgset "flag [name]" Set a flag to determine behaviour. Current flags + are: IPSRC_RND # IP source is random (between min/max) + IPDST_RND # IP destination is random + UDPSRC_RND, UDPDST_RND, + MACSRC_RND, MACDST_RND + TXSIZE_RND, IPV6, + MPLS_RND, VID_RND, SVID_RND + FLOW_SEQ, + QUEUE_MAP_RND # queue map random + QUEUE_MAP_CPU # queue map mirrors smp_processor_id() + UDPCSUM, + IPSEC # IPsec encapsulation (needs CONFIG_XFRM) + NODE_ALLOC # node specific memory allocation + NO_TIMESTAMP # disable timestamping + pgset 'flag ![name]' Clear a flag to determine behaviour. 
+ Note that you might need to use single quotes in + interactive mode, so that your shell doesn't expand + the specified flag as a history command. + + pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. + + pgset "udp_src_min 9" set UDP source port min; if < udp_src_max, then + cycle through the port range. + + pgset "udp_src_max 9" set UDP source port max. + pgset "udp_dst_min 9" set UDP destination port min; if < udp_dst_max, then + cycle through the port range. + pgset "udp_dst_max 9" set UDP destination port max. + + pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example + outer label=16,middle label=32, + inner label=0 (IPv4 NULL)) Note that + there must be no spaces between the + arguments. Leading zeros are required. + Do not set the bottom of stack bit, + that's done automatically. If you do + set the bottom of stack bit, that + indicates that you want to randomly + generate that address and the flag + MPLS_RND will be turned on. You + can have any mix of random and fixed + labels in the label stack. + + pgset "mpls 0" turn off mpls (or any invalid argument works too!) + + pgset "vlan_id 77" set VLAN ID 0-4095 + pgset "vlan_p 3" set priority bit 0-7 (default 0) + pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "svlan_id 22" set SVLAN ID 0-4095 + pgset "svlan_p 3" set priority bit 0-7 (default 0) + pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "vlan_id 9999" > 4095 remove vlan and svlan tags + pgset "svlan 9999" > 4095 remove svlan tag + + + pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) + pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) + + pgset "rate 300M" set rate to 300 Mb/s + pgset "ratep 1000000" set rate to 1Mpps + + pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() + Works with "burst" but not with "clone_skb". + Default xmit_mode is "start_xmit". + +Sample scripts +============== + +A collection of tutorial scripts and helpers for pktgen is in the +samples/pktgen directory. The helper parameters.sh file supports easy +and consistent parameter parsing across the sample scripts. + +Usage example and help:: + + ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 + +Usage:: + + ./pktgen_sample01_simple.sh [-vx] -i ethX + + -i : ($DEV) output interface/device (required) + -s : ($PKT_SIZE) packet size + -d : ($DEST_IP) destination IP + -m : ($DST_MAC) destination MAC-addr + -t : ($THREADS) threads to start + -c : ($SKB_CLONE) SKB clones sent before allocating a new SKB + -b : ($BURST) HW level bursting of SKBs + -v : ($VERBOSE) verbose + -x : ($DEBUG) debug + +The global variables being set are also listed. E.g. the required +interface/device parameter "-i" sets variable $DEV. Copy the +pktgen_sampleXX scripts and modify them to fit your own needs. + +The old scripts:: + + pktgen.conf-1-2 # 1 CPU 2 dev + pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS + pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 + pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS + pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. + + +Interrupt affinity +=================== +Note that when adding devices to a specific CPU it is a good idea to +also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound +to the same CPU. This reduces cache bouncing when freeing skbs.
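+
+As a rough illustration (an addition to this document, not code from the
+original scripts), the smp_affinity assignment just described can also be
+done from a small C helper that writes a hexadecimal CPU mask to the
+procfile. The IRQ number 24 below is a made-up placeholder; look up your
+NIC's TX queue IRQ in /proc/interrupts first::
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* Write a CPU mask to /proc/irq/<irq>/smp_affinity so that the
+     * NIC's TX interrupts fire on the CPU running the pktgen thread.
+     */
+    static int pin_irq_to_cpu(int irq, int cpu)
+    {
+            char path[64];
+            FILE *f;
+
+            snprintf(path, sizeof(path), "/proc/irq/%d/smp_affinity", irq);
+            f = fopen(path, "w");
+            if (!f)
+                    return -1;
+            /* smp_affinity takes a hex bitmask; bit N selects CPU N */
+            fprintf(f, "%x\n", 1 << cpu);
+            return fclose(f);
+    }
+
+    int main(void)
+    {
+            /* e.g. pin hypothetical IRQ 24 to CPU 0, matching kpktgend_0 */
+            return pin_irq_to_cpu(24, 0) ? EXIT_FAILURE : EXIT_SUCCESS;
+    }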
+ +Also set the device flag QUEUE_MAP_CPU, which maps the SKB's TX queue +to the running thread's CPU (directly from smp_processor_id()). + +Enable IPsec +============ +Default IPsec transformation with ESP encapsulation plus transport mode +can be enabled by simply setting:: + + pgset "flag IPSEC" + pgset "flows 1" + +To avoid breaking existing testbed scripts when using AH type and tunnel mode, +you can use "pgset spi SPI_VALUE" to specify which transformation mode +to employ. + + +Current commands and configuration options +========================================== + +**Pgcontrol commands**:: + + start + stop + reset + +**Thread commands**:: + + add_device + rem_device_all + + +**Device commands**:: + + count + clone_skb + burst + debug + + frags + delay + + src_mac_count + dst_mac_count + + pkt_size + min_pkt_size + max_pkt_size + + queue_map_min + queue_map_max + skb_priority + + tos (ipv4) + traffic_class (ipv6) + + mpls + + udp_src_min + udp_src_max + + udp_dst_min + udp_dst_max + + node + + flag + IPSRC_RND + IPDST_RND + UDPSRC_RND + UDPDST_RND + MACSRC_RND + MACDST_RND + TXSIZE_RND + IPV6 + MPLS_RND + VID_RND + SVID_RND + FLOW_SEQ + QUEUE_MAP_RND + QUEUE_MAP_CPU + UDPCSUM + IPSEC + NODE_ALLOC + NO_TIMESTAMP + + spi (ipsec) + + dst_min + dst_max + + src_min + src_max + + dst_mac + src_mac + + clear_counters + + src6 + dst6 + dst6_max + dst6_min + + flows + flowlen + + rate + ratep + + xmit_mode + + vlan_cfi + vlan_id + vlan_p + + svlan_cfi + svlan_id + svlan_p + + +References: + +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ + +Paper from Linux-Kongress in Erlangen 2004. +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf + +Thanks to: + +Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek, +Stephen Hemminger, Andi Kleen, Dave Miller and many others. + + +Good luck with the linux net-development. diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt deleted file mode 100644 index d2fd78f85aa4..000000000000 --- a/Documentation/networking/pktgen.txt +++ /dev/null @@ -1,400 +0,0 @@ - - - HOWTO for the linux packet generator - ------------------------------------ - -Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel -or as a module. A module is preferred; modprobe pktgen if needed. Once -running, pktgen creates a thread for each CPU with affinity to that CPU. -Monitoring and controlling is done via /proc. It is easiest to select a -suitable sample script and configure that. - -On a dual CPU: - -ps aux | grep pkt -root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] -root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] - - -For monitoring and control pktgen creates: - /proc/net/pktgen/pgctrl - /proc/net/pktgen/kpktgend_X - /proc/net/pktgen/ethX - - -Tuning NIC for max performance -============================== - -The default NIC settings are (likely) not tuned for pktgen's artificial -overload type of benchmarking, as this could hurt the normal use-case. - -Specifically increasing the TX ring buffer in the NIC: - # ethtool -G ethX tx 1024 - -A larger TX ring can improve pktgen's performance, while it can hurt -in the general case, 1) because the TX ring buffer might get larger -than the CPU's L1/L2 cache, 2) because it allows more queueing in the -NIC HW layer (which is bad for bufferbloat). - -One should hesitate to conclude that packets/descriptors in the HW -TX ring cause delay.
Drivers usually delay cleaning up the -ring-buffers for various performance reasons, and packets stalling -the TX ring might just be waiting for cleanup. - -This cleanup issue is specifically the case for the driver ixgbe -(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, -and the cleanup interval is affected by the ethtool --coalesce setting -of parameter "rx-usecs". - -For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6): - # ethtool -C ethX rx-usecs 30 - - -Kernel threads -============== -Pktgen creates a thread for each CPU with affinity to that CPU. -Which is controlled through procfile /proc/net/pktgen/kpktgend_X. - -Example: /proc/net/pktgen/kpktgend_0 - - Running: - Stopped: eth4@0 - Result: OK: add_device=eth4@0 - -Most important are the devices assigned to the thread. - -The two basic thread commands are: - * add_device DEVICE@NAME -- adds a single device - * rem_device_all -- remove all associated devices - -When adding a device to a thread, a corresponding procfile is created -which is used for configuring this device. Thus, device names need to -be unique. - -To support adding the same device to multiple threads, which is useful -with multi queue NICs, the device naming scheme is extended with "@": - device@something - -The part after "@" can be anything, but it is custom to use the thread -number. - -Viewing devices -=============== - -The Params section holds configured information. The Current section -holds running statistics. The Result is printed after a run or after -interruption. Example: - -/proc/net/pktgen/eth4@0 - - Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 - frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 - flows: 0 flowlen: 0 - queue_map_min: 0 queue_map_max: 0 - dst_min: 192.168.81.2 dst_max: - src_min: src_max: - src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 - udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 - src_mac_count: 0 dst_mac_count: 0 - Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU - Current: - pkts-sofar: 100000 errors: 0 - started: 623913381008us stopped: 623913396439us idle: 25us - seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 - cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 - cur_udp_dst: 9 cur_udp_src: 42 - cur_queue_map: 0 - flows: 0 - Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) - 6480562pps 3110Mb/sec (3110669760bps) errors: 0 - - -Configuring devices -=================== -This is done via the /proc interface, and most easily done via pgset -as defined in the sample scripts. -You need to specify PGDEV environment variable to use functions from sample -scripts, i.e.: -export PGDEV=/proc/net/pktgen/eth4@0 -source samples/pktgen/functions.sh - -Examples: - - pg_ctrl start starts injection. - pg_ctrl stop aborts injection. Also, ^C aborts generator. - - pgset "clone_skb 1" sets the number of copies of the same packet - pgset "clone_skb 0" use single SKB for all transmits - pgset "burst 8" uses xmit_more API to queue 8 copies of the same - packet and update HW tx queue tail pointer once. - "burst 1" is the default - pgset "pkt_size 9014" sets packet size to 9014 - pgset "frags 5" packet will consist of 5 fragments - pgset "count 200000" sets number of packets to send, set to zero - for continuous sends until explicitly stopped. - - pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds - - pgset "dst 10.0.0.1" sets IP destination address - (BEWARE! This generator is very aggressive!) 
- - pgset "dst_min 10.0.0.1" Same as dst - pgset "dst_max 10.0.0.254" Set the maximum destination IP. - pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. - pgset "src_max 10.0.0.254" Set the maximum source IP. - pgset "dst6 fec0::1" IPV6 destination address - pgset "src6 fec0::2" IPV6 source address - pgset "dstmac 00:00:00:00:00:00" sets MAC destination address - pgset "srcmac 00:00:00:00:00:00" sets MAC source address - - pgset "queue_map_min 0" Sets the min value of tx queue interval - pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices - To select queue 1 of a given device, - use queue_map_min=1 and queue_map_max=1 - - pgset "src_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with srcmac. - - pgset "dst_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with dstmac. - - pgset "flag [name]" Set a flag to determine behaviour. Current flags - are: IPSRC_RND # IP source is random (between min/max) - IPDST_RND # IP destination is random - UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND - TXSIZE_RND, IPV6, - MPLS_RND, VID_RND, SVID_RND - FLOW_SEQ, - QUEUE_MAP_RND # queue map random - QUEUE_MAP_CPU # queue map mirrors smp_processor_id() - UDPCSUM, - IPSEC # IPsec encapsulation (needs CONFIG_XFRM) - NODE_ALLOC # node specific memory allocation - NO_TIMESTAMP # disable timestamping - pgset 'flag ![name]' Clear a flag to determine behaviour. - Note that you might need to use single quote in - interactive mode, so that your shell wouldn't expand - the specified flag as a history command. - - pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. - - pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then - cycle through the port range. - - pgset "udp_src_max 9" set UDP source port max. - pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then - cycle through the port range. - pgset "udp_dst_max 9" set UDP destination port max. - - pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example - outer label=16,middle label=32, - inner label=0 (IPv4 NULL)) Note that - there must be no spaces between the - arguments. Leading zeros are required. - Do not set the bottom of stack bit, - that's done automatically. If you do - set the bottom of stack bit, that - indicates that you want to randomly - generate that address and the flag - MPLS_RND will be turned on. You - can have any mix of random and fixed - labels in the label stack. - - pgset "mpls 0" turn off mpls (or any invalid argument works too!) - - pgset "vlan_id 77" set VLAN ID 0-4095 - pgset "vlan_p 3" set priority bit 0-7 (default 0) - pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "svlan_id 22" set SVLAN ID 0-4095 - pgset "svlan_p 3" set priority bit 0-7 (default 0) - pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "vlan_id 9999" > 4095 remove vlan and svlan tags - pgset "svlan 9999" > 4095 remove svlan tag - - - pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) - pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) - - pgset "rate 300M" set rate to 300 Mb/s - pgset "ratep 1000000" set rate to 1Mpps - - pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() - Works with "burst" but not with "clone_skb". - Default xmit_mode is "start_xmit". 
- -Sample scripts -============== - -A collection of tutorial scripts and helpers for pktgen is in the -samples/pktgen directory. The helper parameters.sh file support easy -and consistent parameter parsing across the sample scripts. - -Usage example and help: - ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 - -Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX - -i : ($DEV) output interface/device (required) - -s : ($PKT_SIZE) packet size - -d : ($DEST_IP) destination IP - -m : ($DST_MAC) destination MAC-addr - -t : ($THREADS) threads to start - -c : ($SKB_CLONE) SKB clones send before alloc new SKB - -b : ($BURST) HW level bursting of SKBs - -v : ($VERBOSE) verbose - -x : ($DEBUG) debug - -The global variables being set are also listed. E.g. the required -interface/device parameter "-i" sets variable $DEV. Copy the -pktgen_sampleXX scripts and modify them to fit your own needs. - -The old scripts: - -pktgen.conf-1-2 # 1 CPU 2 dev -pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS -pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 -pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS -pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. - - -Interrupt affinity -=================== -Note that when adding devices to a specific CPU it is a good idea to -also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound -to the same CPU. This reduces cache bouncing when freeing skbs. - -Plus using the device flag QUEUE_MAP_CPU, which maps the SKBs TX queue -to the running threads CPU (directly from smp_processor_id()). - -Enable IPsec -============ -Default IPsec transformation with ESP encapsulation plus transport mode -can be enabled by simply setting: - -pgset "flag IPSEC" -pgset "flows 1" - -To avoid breaking existing testbed scripts for using AH type and tunnel mode, -you can use "pgset spi SPI_VALUE" to specify which transformation mode -to employ. - - -Current commands and configuration options -========================================== - -** Pgcontrol commands: - -start -stop -reset - -** Thread commands: - -add_device -rem_device_all - - -** Device commands: - -count -clone_skb -burst -debug - -frags -delay - -src_mac_count -dst_mac_count - -pkt_size -min_pkt_size -max_pkt_size - -queue_map_min -queue_map_max -skb_priority - -tos (ipv4) -traffic_class (ipv6) - -mpls - -udp_src_min -udp_src_max - -udp_dst_min -udp_dst_max - -node - -flag - IPSRC_RND - IPDST_RND - UDPSRC_RND - UDPDST_RND - MACSRC_RND - MACDST_RND - TXSIZE_RND - IPV6 - MPLS_RND - VID_RND - SVID_RND - FLOW_SEQ - QUEUE_MAP_RND - QUEUE_MAP_CPU - UDPCSUM - IPSEC - NODE_ALLOC - NO_TIMESTAMP - -spi (ipsec) - -dst_min -dst_max - -src_min -src_max - -dst_mac -src_mac - -clear_counters - -src6 -dst6 -dst6_max -dst6_min - -flows -flowlen - -rate -ratep - -xmit_mode - -vlan_cfi -vlan_id -vlan_p - -svlan_cfi -svlan_id -svlan_p - - -References: -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ - -Paper from Linux-Kongress in Erlangen 2004. -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf - -Thanks to: -Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek -Stephen Hemminger, Andi Kleen, Dave Miller and many others. - - -Good luck with the linux net-development. diff --git a/net/Kconfig b/net/Kconfig index 8b1f85820a6b..c5ba2d180c43 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -344,7 +344,7 @@ config NET_PKTGEN what was just said, you don't need it: say N. 
Documentation on how to use the packet generator can be found - at <file:Documentation/networking/pktgen.txt>. + at <file:Documentation/networking/pktgen.rst>. To compile this code as a module, choose M here: the module will be called pktgen. diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 08e2811b5274..b53b6d38c4df 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -56,7 +56,7 @@ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index 3f6483e8b2df..f9c53ca5cf93 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -3,7 +3,7 @@ Sample and benchmark scripts for pktgen (packet generator) This directory contains some pktgen sample and benchmark scripts, that can easily be copied and adjusted for your own use-case. -General doc is located in kernel: Documentation/networking/pktgen.txt +General doc is located in kernel: Documentation/networking/pktgen.rst Helper include files ==================== -- cgit v1.2.3 From 66d495d0a5aecd1692f6b5e3190de14f9a31e14b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:17 +0200 Subject: docs: networking: convert radiotap-headers.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust indentation, whitespace and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/mac80211-injection.rst | 2 +- Documentation/networking/radiotap-headers.rst | 159 ++++++++++++++++++++++++ Documentation/networking/radiotap-headers.txt | 152 ---------------------- include/net/cfg80211.h | 2 +- net/wireless/radiotap.c | 2 +- 6 files changed, 163 insertions(+), 155 deletions(-) create mode 100644 Documentation/networking/radiotap-headers.rst delete mode 100644 Documentation/networking/radiotap-headers.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 0da7eb0ec85a..85bc52d0b3a6 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -95,6 +95,7 @@ Contents: plip ppp_generic proc_net_tcp + radiotap-headers .. only:: subproject and html diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst index 75d4edcae852..be65f886ff1f 100644 --- a/Documentation/networking/mac80211-injection.rst +++ b/Documentation/networking/mac80211-injection.rst @@ -13,7 +13,7 @@ following format:: [ payload ] The radiotap format is discussed in -./Documentation/networking/radiotap-headers.txt. +./Documentation/networking/radiotap-headers.rst. Despite many radiotap parameters being currently defined, most only make sense to appear on received packets. The following information is parsed from the diff --git a/Documentation/networking/radiotap-headers.rst b/Documentation/networking/radiotap-headers.rst new file mode 100644 index 000000000000..1a1bd1ec0650 --- /dev/null +++ b/Documentation/networking/radiotap-headers.rst @@ -0,0 +1,159 @@ +..
SPDX-License-Identifier: GPL-2.0 + +=========================== +How to use radiotap headers +=========================== + +Pointer to the radiotap include file +------------------------------------ + +Radiotap headers are variable-length and extensible; you can get most of the +information you need to know about them from:: + + ./include/net/ieee80211_radiotap.h + +This document gives an overview and warns about some corner cases. + + +Structure of the header +----------------------- + +There is a fixed portion at the start which contains a u32 bitmap that defines +whether the possible argument associated with that bit is present or not. So if +b0 of the it_present member of ieee80211_radiotap_header is set, it means that +the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the +argument area. + +:: + + < 8-byte ieee80211_radiotap_header > + [ <possible argument bitmap extensions ... > ] + [ <argument> ... ] + +At the moment there are only 13 possible argument indexes defined, but in case +we run out of space in the u32 it_present member, it is defined that b31 set +indicates that there is another u32 bitmap following (shown as "possible +argument bitmap extensions..." above), and the start of the arguments is moved +forward 4 bytes each time. + +Note also that the __le16 it_len member is set to the total number of bytes +covered by the ieee80211_radiotap_header and any arguments following. + + +Requirements for arguments +-------------------------- + +After the fixed part of the header, the arguments follow for each argument +index whose matching bit is set in the it_present member of +ieee80211_radiotap_header. + + - the arguments are all stored little-endian! + + - the argument payload for a given argument index has a fixed size. So + IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is + present. See the comments in ./include/net/ieee80211_radiotap.h for a nice + breakdown of all the argument sizes + + - the arguments must be aligned to a boundary of the argument size using + padding. So a u16 argument must start on the next u16 boundary if it isn't + already on one, a u32 must start on the next u32 boundary and so on. + + - "alignment" is relative to the start of the ieee80211_radiotap_header, ie, + the first byte of the radiotap header. The absolute alignment of that first + byte isn't defined. So even if the whole radiotap header is starting at, eg, + address 0x00000003, still the first byte of the radiotap header is treated as + 0 for alignment purposes. + + - the above point that there may be no absolute alignment for multibyte + entities in the fixed radiotap header or the argument region means that you + have to take special evasive action when trying to access these multibyte + entities. Some arches like Blackfin cannot deal with an attempt to + dereference, eg, a u16 pointer that is pointing to an odd address. Instead + you have to use a kernel API get_unaligned() to dereference the pointer, + which will do it bytewise on the arches that require that. + + - The arguments for a given argument index can be a compound of multiple types + together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload + consisting of two u16s of total length 4. When this happens, the padding + rule is applied dealing with a u16, NOT dealing with a 4-byte single entity.
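+
+To make the alignment rule concrete, here is a short C sketch (an
+illustration added for this document, not code from the kernel tree) of how
+a parser advances its offset to the natural alignment of the next argument
+and reads a little-endian u16 without assuming the pointer is aligned - a
+userspace stand-in for the kernel's get_unaligned()::
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    /* Round offset up to the natural alignment of the next argument.
+     * The offset is relative to the first byte of the radiotap header,
+     * per the alignment rule above.
+     */
+    static size_t rt_align(size_t offset, size_t align)
+    {
+            return (offset + align - 1) & ~(align - 1);
+    }
+
+    /* Read a u16 argument byte-wise: safe on any arch, any alignment,
+     * and forces the little-endian interpretation the format requires.
+     */
+    static uint16_t rt_get_le16(const uint8_t *p)
+    {
+            return (uint16_t)(p[0] | (p[1] << 8));
+    }
+
+    int main(void)
+    {
+            /* a u16 argument following a 1-byte argument ending at
+             * offset 9 must start at offset 10, the next u16 boundary
+             */
+            uint8_t raw[2] = { 0x6c, 0x09 };    /* little-endian 2412 */
+
+            printf("%zu\n", rt_align(9, sizeof(uint16_t)));  /* 10 */
+            printf("%u\n", rt_get_le16(raw));                /* 2412 */
+            return 0;
+    }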
+ + +Example valid radiotap header +----------------------------- + +:: + + 0x00, 0x00, // <-- radiotap version + pad byte + 0x0b, 0x00, // <-- radiotap header length + 0x04, 0x0c, 0x00, 0x00, // <-- bitmap + 0x6c, // <-- rate (in 500kbps units) + 0x0c, // <-- tx power + 0x01 // <-- antenna + + +Using the Radiotap Parser +------------------------- + +If you have to parse a radiotap struct, you can radically simplify the +job by using the radiotap parser that lives in net/wireless/radiotap.c and has +its prototypes available in include/net/cfg80211.h. You use it like this:: + + #include <net/cfg80211.h> + + /* buf points to the start of the radiotap header part */ + + int MyFunction(u8 * buf, int buflen) + { + int pkt_rate_100kHz = 0, antenna = 0, pwr = 0; + struct ieee80211_radiotap_iterator iterator; + int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen); + + while (!ret) { + + ret = ieee80211_radiotap_iterator_next(&iterator); + + if (ret) + continue; + + /* see if this argument is something we can use */ + + switch (iterator.this_arg_index) { + /* + * You must take care when dereferencing iterator.this_arg + * for multibyte types... the pointer is not aligned. Use + * get_unaligned((type *)iterator.this_arg) to dereference + * iterator.this_arg for type "type" safely on all arches. + */ + case IEEE80211_RADIOTAP_RATE: + /* radiotap "rate" u8 is in + * 500kbps units, eg, 0x02=1Mbps + */ + pkt_rate_100kHz = (*iterator.this_arg) * 5; + break; + + case IEEE80211_RADIOTAP_ANTENNA: + /* radiotap uses 0 for 1st ant */ + antenna = *iterator.this_arg; + break; + + case IEEE80211_RADIOTAP_DBM_TX_POWER: + pwr = *iterator.this_arg; + break; + + default: + break; + } + } /* while more rt headers */ + + if (ret != -ENOENT) + return TXRX_DROP; + + /* discard the radiotap header part */ + buf += iterator.max_length; + buflen -= iterator.max_length; + + ... + + } + +Andy Green diff --git a/Documentation/networking/radiotap-headers.txt b/Documentation/networking/radiotap-headers.txt deleted file mode 100644 index 953331c7984f..000000000000 --- a/Documentation/networking/radiotap-headers.txt +++ /dev/null @@ -1,152 +0,0 @@ -How to use radiotap headers -=========================== - -Pointer to the radiotap include file ------------------------------------- - -Radiotap headers are variable-length and extensible, you can get most of the -information you need to know on them from: - -./include/net/ieee80211_radiotap.h - -This document gives an overview and warns on some corner cases. - - -Structure of the header ------------------------ - -There is a fixed portion at the start which contains a u32 bitmap that defines -if the possible argument associated with that bit is present or not. So if b0 -of the it_present member of ieee80211_radiotap_header is set, it means that -the header for argument index 0 (IEEE80211_RADIOTAP_TSFT) is present in the -argument area. - - < 8-byte ieee80211_radiotap_header > - [ ] - [ ... ] - -At the moment there are only 13 possible argument indexes defined, but in case -we run out of space in the u32 it_present member, it is defined that b31 set -indicates that there is another u32 bitmap following (shown as "possible -argument bitmap extensions..." above), and the start of the arguments is moved -forward 4 bytes each time. - -Note also that the it_len member __le16 is set to the total number of bytes -covered by the ieee80211_radiotap_header and any arguments following.
- - -Requirements for arguments --------------------------- - -After the fixed part of the header, the arguments follow for each argument -index whose matching bit is set in the it_present member of -ieee80211_radiotap_header. - - - the arguments are all stored little-endian! - - - the argument payload for a given argument index has a fixed size. So - IEEE80211_RADIOTAP_TSFT being present always indicates an 8-byte argument is - present. See the comments in ./include/net/ieee80211_radiotap.h for a nice - breakdown of all the argument sizes - - - the arguments must be aligned to a boundary of the argument size using - padding. So a u16 argument must start on the next u16 boundary if it isn't - already on one, a u32 must start on the next u32 boundary and so on. - - - "alignment" is relative to the start of the ieee80211_radiotap_header, ie, - the first byte of the radiotap header. The absolute alignment of that first - byte isn't defined. So even if the whole radiotap header is starting at, eg, - address 0x00000003, still the first byte of the radiotap header is treated as - 0 for alignment purposes. - - - the above point that there may be no absolute alignment for multibyte - entities in the fixed radiotap header or the argument region means that you - have to take special evasive action when trying to access these multibyte - entities. Some arches like Blackfin cannot deal with an attempt to - dereference, eg, a u16 pointer that is pointing to an odd address. Instead - you have to use a kernel API get_unaligned() to dereference the pointer, - which will do it bytewise on the arches that require that. - - - The arguments for a given argument index can be a compound of multiple types - together. For example IEEE80211_RADIOTAP_CHANNEL has an argument payload - consisting of two u16s of total length 4. When this happens, the padding - rule is applied dealing with a u16, NOT dealing with a 4-byte single entity. - - -Example valid radiotap header ------------------------------ - - 0x00, 0x00, // <-- radiotap version + pad byte - 0x0b, 0x00, // <- radiotap header length - 0x04, 0x0c, 0x00, 0x00, // <-- bitmap - 0x6c, // <-- rate (in 500kHz units) - 0x0c, //<-- tx power - 0x01 //<-- antenna - - -Using the Radiotap Parser -------------------------- - -If you are having to parse a radiotap struct, you can radically simplify the -job by using the radiotap parser that lives in net/wireless/radiotap.c and has -its prototypes available in include/net/cfg80211.h. You use it like this: - -#include - -/* buf points to the start of the radiotap header part */ - -int MyFunction(u8 * buf, int buflen) -{ - int pkt_rate_100kHz = 0, antenna = 0, pwr = 0; - struct ieee80211_radiotap_iterator iterator; - int ret = ieee80211_radiotap_iterator_init(&iterator, buf, buflen); - - while (!ret) { - - ret = ieee80211_radiotap_iterator_next(&iterator); - - if (ret) - continue; - - /* see if this argument is something we can use */ - - switch (iterator.this_arg_index) { - /* - * You must take care when dereferencing iterator.this_arg - * for multibyte types... the pointer is not aligned. Use - * get_unaligned((type *)iterator.this_arg) to dereference - * iterator.this_arg for type "type" safely on all arches. 
- */ - case IEEE80211_RADIOTAP_RATE: - /* radiotap "rate" u8 is in - * 500kbps units, eg, 0x02=1Mbps - */ - pkt_rate_100kHz = (*iterator.this_arg) * 5; - break; - - case IEEE80211_RADIOTAP_ANTENNA: - /* radiotap uses 0 for 1st ant */ - antenna = *iterator.this_arg); - break; - - case IEEE80211_RADIOTAP_DBM_TX_POWER: - pwr = *iterator.this_arg; - break; - - default: - break; - } - } /* while more rt headers */ - - if (ret != -ENOENT) - return TXRX_DROP; - - /* discard the radiotap header part */ - buf += iterator.max_length; - buflen -= iterator.max_length; - - ... - -} - -Andy Green diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..46ac80423b28 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5211,7 +5211,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, * Radiotap parsing functions -- for controlled injection support * * Implemented in net/wireless/radiotap.c - * Documentation in Documentation/networking/radiotap-headers.txt + * Documentation in Documentation/networking/radiotap-headers.rst */ struct radiotap_align_size { diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 6582d155e2fc..d5e28239e030 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -90,7 +90,7 @@ static const struct ieee80211_radiotap_namespace radiotap_ns = { * iterator.this_arg for type "type" safely on all arches. * * Example code: - * See Documentation/networking/radiotap-headers.txt + * See Documentation/networking/radiotap-headers.rst */ int ieee80211_radiotap_iterator_init( -- cgit v1.2.3 From 9f72374cb5959556870be8078b128158edde5d3e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:21 +0200 Subject: docs: networking: convert rxrpc.txt to ReST - add SPDX header; - adjust title markup; - use autonumbered list markup; - mark code blocks and literals as such; - mark tables as such; - adjust indentation, whitespace and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/filesystems/afs.rst | 2 +- Documentation/networking/index.rst | 1 + Documentation/networking/rxrpc.rst | 1169 ++++++++++++++++++++++++++++++++++++ Documentation/networking/rxrpc.txt | 1155 ----------------------------------- MAINTAINERS | 2 +- net/rxrpc/Kconfig | 6 +- net/rxrpc/sysctl.c | 2 +- 7 files changed, 1176 insertions(+), 1161 deletions(-) create mode 100644 Documentation/networking/rxrpc.rst delete mode 100644 Documentation/networking/rxrpc.txt (limited to 'net') diff --git a/Documentation/filesystems/afs.rst b/Documentation/filesystems/afs.rst index c4ec39a5966e..cada9464d6bd 100644 --- a/Documentation/filesystems/afs.rst +++ b/Documentation/filesystems/afs.rst @@ -70,7 +70,7 @@ list of volume location server IP addresses:: The first module is the AF_RXRPC network protocol driver. This provides the RxRPC remote operation protocol and may also be accessed from userspace. See: - Documentation/networking/rxrpc.txt + Documentation/networking/rxrpc.rst The second module is the kerberos RxRPC security driver, and the third module is the actual filesystem driver for the AFS filesystem. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index bc3b04a2edde..cd307b9601fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -99,6 +99,7 @@ Contents: ray_cs rds regulatory + rxrpc + +..
only:: subproject and html diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst new file mode 100644 index 000000000000..5ad35113d0f4 --- /dev/null +++ b/Documentation/networking/rxrpc.rst @@ -0,0 +1,1169 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +RxRPC Network Protocol +====================== + +The RxRPC protocol driver provides a reliable two-phase transport on top of UDP +that can be used to perform RxRPC remote operations. This is done over sockets +of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and +receive data, aborts and errors. + +Contents of this document: + + (#) Overview. + + (#) RxRPC protocol summary. + + (#) AF_RXRPC driver model. + + (#) Control messages. + + (#) Socket options. + + (#) Security. + + (#) Example client usage. + + (#) Example server usage. + + (#) AF_RXRPC kernel interface. + + (#) Configurable parameters. + + +Overview +======== + +RxRPC is a two-layer protocol. There is a session layer which provides +reliable virtual connections using UDP over IPv4 (or IPv6) as the transport +layer, but implements a real network protocol; and there's the presentation +layer which renders structured data to binary blobs and back again using XDR +(as does SunRPC):: + + +-------------+ + | Application | + +-------------+ + | XDR | Presentation + +-------------+ + | RxRPC | Session + +-------------+ + | UDP | Transport + +-------------+ + + +AF_RXRPC provides: + + (1) Part of an RxRPC facility for both kernel and userspace applications by + making the session part of it a Linux network protocol (AF_RXRPC). + + (2) A two-phase protocol. The client transmits a blob (the request) and then + receives a blob (the reply), and the server receives the request and then + transmits the reply. + + (3) Retention of the reusable bits of the transport system set up for one call + to speed up subsequent calls. + + (4) A secure protocol, using the Linux kernel's key retention facility to + manage security on the client end. The server end must of necessity be + more active in security negotiations. + +AF_RXRPC does not provide XDR marshalling/presentation facilities. That is +left to the application. AF_RXRPC only deals in blobs. Even the operation ID +is just the first four bytes of the request blob, and as such is beyond the +kernel's interest. + + +Sockets of AF_RXRPC family are: + + (1) created as type SOCK_DGRAM; + + (2) provided with a protocol of the type of underlying transport they're going + to use - currently only PF_INET is supported. + + +The Andrew File System (AFS) is an example of an application that uses this and +that has both kernel (filesystem) and userspace (utility) components. + + +RxRPC Protocol Summary +====================== + +An overview of the RxRPC protocol: + + (#) RxRPC sits on top of another networking protocol (UDP is the only option + currently), and uses this to provide network transport. UDP ports, for + example, provide transport endpoints. + + (#) RxRPC supports multiple virtual "connections" from any given transport + endpoint, thus allowing the endpoints to be shared, even to the same + remote endpoint. + + (#) Each connection goes to a particular "service". A connection may not go + to multiple services. A service may be considered the RxRPC equivalent of + a port number. AF_RXRPC permits multiple services to share an endpoint. 
+ + (#) Client-originating packets are marked, thus a transport endpoint can be + shared between client and server connections (connections have a + direction). + + (#) Up to a billion connections may be supported concurrently between one + local transport endpoint and one service on one remote endpoint. An RxRPC + connection is described by seven numbers:: + + Local address } + Local port } Transport (UDP) address + Remote address } + Remote port } + Direction + Connection ID + Service ID + + (#) Each RxRPC operation is a "call". A connection may make up to four + billion calls, but only up to four calls may be in progress on a + connection at any one time. + + (#) Calls are two-phase and asymmetric: the client sends its request data, + which the service receives; then the service transmits the reply data + which the client receives. + + (#) The data blobs are of indefinite size; the end of a phase is marked with a + flag in the packet. The number of packets of data making up one blob may + not exceed 4 billion, however, as this would cause the sequence number to + wrap. + + (#) The first four bytes of the request data are the service operation ID. + + (#) Security is negotiated on a per-connection basis. The connection is + initiated by the first data packet on it arriving. If security is + requested, the server then issues a "challenge" and then the client + replies with a "response". If the response is successful, the security is + set for the lifetime of that connection, and all subsequent calls made + upon it use that same security. In the event that the server lets a + connection lapse before the client, the security will be renegotiated if + the client uses the connection again. + + (#) Calls use ACK packets to handle reliability. Data packets are also + explicitly sequenced per call. + + (#) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. + A hard-ACK indicates to the far side that all the data received to a point + has been received and processed; a soft-ACK indicates that the data has + been received but may yet be discarded and re-requested. The sender may + not discard any transmittable packets until they've been hard-ACK'd. + + (#) Reception of a reply data packet implicitly hard-ACK's all the data + packets that make up the request. + + (#) A call is complete when the request has been sent, the reply has been + received and the final hard-ACK on the last packet of the reply has + reached the server. + + (#) A call may be aborted by either end at any time up to its completion. + + +AF_RXRPC Driver Model +===================== + +About the AF_RXRPC driver: + + (#) The AF_RXRPC protocol transparently uses internal sockets of the transport + protocol to represent transport endpoints. + + (#) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC + connections are handled transparently. One client socket may be used to + make multiple simultaneous calls to the same service. One server socket + may handle calls from many clients. + + (#) Additional parallel client connections will be initiated to support extra + concurrent calls, up to a tunable limit. + + (#) Each connection is retained for a certain amount of time [tunable] after + the last call currently using it has completed in case a new call is made + that could reuse it. + + (#) Each internal UDP socket is retained [tunable] for a certain amount of + time [tunable] after the last connection using it is discarded, in case a new + connection is made that could use it.
+ + (#) A client-side connection is only shared between calls if they have + the same key struct describing their security (and assuming the calls + would otherwise share the connection). Non-secured calls would also be + able to share connections with each other. + + (#) A server-side connection is shared if the client says it is. + + (#) ACK'ing is handled by the protocol driver automatically, including + replying to pings. + + (#) SO_KEEPALIVE automatically pings the other side to keep the connection + alive [TODO]. + + (#) If an ICMP error is received, all calls affected by that error will be + aborted with an appropriate network error passed through recvmsg(). + + +Interaction with the user of the RxRPC socket: + + (#) A socket is made into a server socket by binding an address with a + non-zero service ID. + + (#) In the client, sending a request is achieved with one or more sendmsgs, + followed by the reply being received with one or more recvmsgs. + + (#) The first sendmsg for a request to be sent from a client contains a tag to + be used in all other sendmsgs or recvmsgs associated with that call. The + tag is carried in the control data. + + (#) connect() is used to supply a default destination address for a client + socket. This may be overridden by supplying an alternate address to the + first sendmsg() of a call (struct msghdr::msg_name). + + (#) If connect() is called on an unbound client, a random local port will be + bound before the operation takes place. + + (#) A server socket may also be used to make client calls. To do this, the + first sendmsg() of the call must specify the target address. The server's + transport endpoint is used to send the packets. + + (#) Once the application has received the last message associated with a call, + the tag is guaranteed not to be seen again, and so it can be used to pin + client resources. A new call can then be initiated with the same tag + without fear of interference. + + (#) In the server, a request is received with one or more recvmsgs, then + the reply is transmitted with one or more sendmsgs, and then the final ACK + is received with a last recvmsg. + + (#) When sending data for a call, sendmsg is given MSG_MORE if there's more + data to come on that call. + + (#) When receiving data for a call, recvmsg flags MSG_MORE if there's more + data to come for that call. + + (#) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg + to indicate the terminal message for that call. + + (#) A call may be aborted by adding an abort control message to the control + data. Issuing an abort terminates the kernel's use of that call's tag. + Any messages waiting in the receive queue for that call will be discarded. + + (#) Aborts, busy notifications and challenge packets are delivered by recvmsg, + and control data messages will be set to indicate the context. Receiving + an abort or a busy message terminates the kernel's use of that call's tag. + + (#) The control data part of the msghdr struct is used for a number of things: + + (#) The tag of the intended or affected call. + + (#) Sending or receiving errors, aborts and busy notifications. + + (#) Notifications of incoming calls. + + (#) Sending debug requests and receiving debug replies [TODO]. + + (#) When the kernel has received and set up an incoming call, it sends a + message to the server application to let it know there's a new call awaiting + its acceptance [recvmsg reports a special control message].
The server + application then uses sendmsg to assign a tag to the new call. Once that + is done, the first part of the request data will be delivered by recvmsg. + + (#) The server application has to provide the server socket with a keyring of + secret keys corresponding to the security types it permits. When a secure + connection is being set up, the kernel looks up the appropriate secret key + in the keyring and then sends a challenge packet to the client and + receives a response packet. The kernel then checks the authorisation of + the packet and either aborts the connection or sets up the security. + + (#) The name of the key a client will use to secure its communications is + nominated by a socket option. + + +Notes on sendmsg: + + (#) MSG_WAITALL can be set to tell sendmsg to ignore signals if the peer is + making progress at accepting packets within a reasonable time such that we + manage to queue up all the data for transmission. This requires the + client to accept at least one packet per 2*RTT time period. + + If this isn't set, sendmsg() will return immediately, either returning + EINTR/ERESTARTSYS if nothing was consumed or returning the amount of data + consumed. + + +Notes on recvmsg: + + (#) If there's a sequence of data messages belonging to a particular call on + the receive queue, then recvmsg will keep working through them until: + + (a) it meets the end of that call's received data, + + (b) it meets a non-data message, + + (c) it meets a message belonging to a different call, or + + (d) it fills the user buffer. + + If recvmsg is called in blocking mode, it will keep sleeping, awaiting the + reception of further data, until one of the above four conditions is met. + + (#) MSG_PEEK operates similarly, but will return immediately if it has put any + data in the buffer rather than sleeping until it can fill the buffer. + + (#) If a data message is only partially consumed in filling a user buffer, + then the remainder of that message will be left on the front of the queue + for the next taker. MSG_TRUNC will never be flagged. + + (#) If there is more data to be had on a call (it hasn't copied the last byte + of the last data message in that phase yet), then MSG_MORE will be + flagged. + + +Control Messages +================ + +AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex +calls, to invoke certain actions and to report certain conditions. These are: + + ======================= === =========== =============================== + MESSAGE ID SRT DATA MEANING + ======================= === =========== =============================== + RXRPC_USER_CALL_ID sr- User ID App's call specifier + RXRPC_ABORT srt Abort code Abort code to issue/received + RXRPC_ACK -rt n/a Final ACK received + RXRPC_NET_ERROR -rt error num Network error on call + RXRPC_BUSY -rt n/a Call rejected (server busy) + RXRPC_LOCAL_ERROR -rt error num Local error encountered + RXRPC_NEW_CALL -r- n/a New call received + RXRPC_ACCEPT s-- n/a Accept new call + RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call + RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded + RXRPC_TX_LENGTH s-- data len Total length of Tx data + ======================= === =========== =============================== + + (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) + + (#) RXRPC_USER_CALL_ID + + This is used to indicate the application's call ID.
It's an unsigned long + that the app specifies in the client by attaching it to the first data + message or in the server by passing it in association with an RXRPC_ACCEPT + message. recvmsg() passes it in conjunction with all messages except + those of the RXRPC_NEW_CALL message. + + (#) RXRPC_ABORT + + This can be used by an application to abort a call by passing it to + sendmsg, or it can be delivered by recvmsg to indicate a remote abort was + received. Either way, it must be associated with an RXRPC_USER_CALL_ID to + specify the call affected. If an abort is being sent, then error EBADSLT + will be returned if there is no call with that user ID. + + (#) RXRPC_ACK + + This is delivered to a server application to indicate that the final ACK + of a call was received from the client. It will be associated with an + RXRPC_USER_CALL_ID to indicate the call that's now complete. + + (#) RXRPC_NET_ERROR + + This is delivered to an application to indicate that an ICMP error message + was encountered in the process of trying to talk to the peer. An + errno-class integer value will be included in the control message data + indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call + affected. + + (#) RXRPC_BUSY + + This is delivered to a client application to indicate that a call was + rejected by the server due to the server being busy. It will be + associated with an RXRPC_USER_CALL_ID to indicate the rejected call. + + (#) RXRPC_LOCAL_ERROR + + This is delivered to an application to indicate that a local error was + encountered and that a call has been aborted because of it. An + errno-class integer value will be included in the control message data + indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call + affected. + + (#) RXRPC_NEW_CALL + + This is delivered to indicate to a server application that a new call has + arrived and is awaiting acceptance. No user ID is associated with this, + as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT. + + (#) RXRPC_ACCEPT + + This is used by a server application to attempt to accept a call and + assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID + to indicate the user ID to be assigned. If there is no call to be + accepted (it may have timed out, been aborted, etc.), then sendmsg will + return error ENODATA. If the user ID is already in use by another call, + then error EBADSLT will be returned. + + (#) RXRPC_EXCLUSIVE_CALL + + This is used to indicate that a client call should be made on a one-off + connection. The connection is discarded once the call has terminated. + + (#) RXRPC_UPGRADE_SERVICE + + This is used to make a client call to probe if the specified service ID + may be upgraded by the server. The caller must check msg_name returned to + recvmsg() for the service ID actually in use. The operation probed must + be one that takes the same arguments in both services. + + Once this has been used to establish the upgrade capability (or lack + thereof) of the server, the service ID returned should be used for all + future communication to that server and RXRPC_UPGRADE_SERVICE should no + longer be set. + + (#) RXRPC_TX_LENGTH + + This is used to inform the kernel of the total amount of data that is + going to be transmitted by a call (whether in a client request or a + service response). If given, it allows the kernel to encrypt from the + userspace buffer directly to the packet buffers, rather than copying into + the buffer and then encrypting in place.
This may only be given with the + first sendmsg() providing data for a call. EMSGSIZE will be generated if + the amount of data actually given is different. + + This takes a parameter of __s64 type that indicates how much will be + transmitted. This may not be less than zero. + +The symbol RXRPC__SUPPORTED is defined as one more than the highest control +message type supported. At run time this can be queried by means of the +RXRPC_SUPPORTED_CMSG socket option (see below). + + +Socket Options +============== + +AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: + + (#) RXRPC_SECURITY_KEY + + This is used to specify the description of the key to be used. The key is + extracted from the calling process's keyrings with request_key() and + should be of "rxrpc" type. + + The optval pointer points to the description string, and optlen indicates + how long the string is, without the NUL terminator. + + (#) RXRPC_SECURITY_KEYRING + + Similar to above but specifies a keyring of server secret keys to use (key + type "keyring"). See the "Security" section. + + (#) RXRPC_EXCLUSIVE_CONNECTION + + This is used to request that new connections should be used for each call + made subsequently on this socket. optval should be NULL and optlen 0. + + (#) RXRPC_MIN_SECURITY_LEVEL + + This is used to specify the minimum security level required for calls on + this socket. optval must point to an int containing one of the following + values: + + (a) RXRPC_SECURITY_PLAIN + + Encrypted checksum only. + + (b) RXRPC_SECURITY_AUTH + + Encrypted checksum plus packet padded and first eight bytes of packet + encrypted - which includes the actual packet length. + + (c) RXRPC_SECURITY_ENCRYPTED + + Encrypted checksum plus entire packet padded and encrypted, including + actual packet length. + + (#) RXRPC_UPGRADEABLE_SERVICE + + This is used to indicate that a service socket with two bindings may + upgrade one bound service to the other if requested by the client. optval + must point to an array of two unsigned short ints. The first is the + service ID to upgrade from and the second the service ID to upgrade to. + + (#) RXRPC_SUPPORTED_CMSG + + This is a read-only option that writes an int into the buffer indicating + the highest control message type supported. + + +Security +======== + +Currently, only the Kerberos 4 equivalent protocol has been implemented +(security index 2 - rxkad). This requires the rxkad module to be loaded and, +on the client, tickets of the appropriate type to be obtained from the AFS +kaserver or the Kerberos server and installed as "rxrpc" type keys. This is +normally done using the klog program. A simple example klog program can be +found at: + + http://people.redhat.com/~dhowells/rxrpc/klog.c + +The payload provided to add_key() on the client should be of the following +form:: + + struct rxrpc_key_sec2_v1 { + uint16_t security_index; /* 2 */ + uint16_t ticket_length; /* length of ticket[] */ + uint32_t expiry; /* time at which expires */ + uint8_t kvno; /* key version number */ + uint8_t __pad[3]; + uint8_t session_key[8]; /* DES session key */ + uint8_t ticket[0]; /* the encrypted ticket */ + }; + +Where the ticket blob is just appended to the above structure. + + +For the server, keys of type "rxrpc_s" must be made available to the server. +They have a description of "<serviceID>:<securityIndex>" (eg: "52:2" for an +rxkad key for the AFS VL service). When such a key is created, it should be +given the server's secret key as the instantiation data (see the example +below)::
+ + add_key("rxrpc_s", "52:2", secret_key, 8, keyring); + +A keyring is passed to the server socket by naming it in a sockopt. The server +socket then looks the server secret keys up in this keyring when secure +incoming connections are made. This can be seen in an example program that can +be found at: + + http://people.redhat.com/~dhowells/rxrpc/listen.c + + +Example Client Usage +==================== + +A client would issue an operation by: + + (1) An RxRPC socket is set up by:: + + client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); + + Where the third parameter indicates the protocol family of the transport + socket used - usually IPv4 but it can also be IPv6 [TODO]. + + (2) A local address can optionally be bound:: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = 0, /* we're a client */ + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7000), /* AFS callback */ + .transport.sin_address = 0, /* all local interfaces */ + }; + bind(client, &srx, sizeof(srx)); + + This specifies the local UDP port to be used. If not given, a random + non-privileged port will be used. A UDP port may be shared between + several unrelated RxRPC sockets. Security is handled on a + per-RxRPC-virtual-connection basis. + + (3) The security is set:: + + const char *key = "AFS:cambridge.redhat.com"; + setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key)); + + This issues a request_key() to get the key representing the security + context. The minimum security level can be set:: + + unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, + &sec, sizeof(sec)); + + (4) The server to be contacted can then be specified (alternatively this can + be done through sendmsg):: + + struct sockaddr_rxrpc srx = { + .srx_family = AF_RXRPC, + .srx_service = VL_SERVICE_ID, + .transport_type = SOCK_DGRAM, /* type of transport socket */ + .transport.sin_family = AF_INET, + .transport.sin_port = htons(7005), /* AFS volume manager */ + .transport.sin_address = ..., + }; + connect(client, &srx, sizeof(srx)); + + (5) The request data should then be posted to the server socket using a series + of sendmsg() calls, each with the following control message attached: + + ================== =================================== + RXRPC_USER_CALL_ID specifies the user ID for this call + ================== =================================== + + MSG_MORE should be set in msghdr::msg_flags on all but the last part of + the request. Multiple requests may be made simultaneously. + + An RXRPC_TX_LENGTH control message can also be specified on the first + sendmsg() call. + + If a call is intended to go to a destination other than the default + specified through connect(), then msghdr::msg_name should be set on the + first request message of that call. + + (6) The reply data will then be posted to the client socket for recvmsg() to + pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data + for a particular call to be read. MSG_EOR will be set on the terminal + read for a call. + + All data will be delivered with the following control message attached: + + RXRPC_USER_CALL_ID - specifies the user ID for this call + + If an abort or error occurred, this will be returned in the control data + buffer instead, and MSG_EOR will be flagged to indicate the end of that + call.
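+
+Putting the above steps together, a minimal client sketch might look as
+follows. This is an illustration assembled from the snippets above, not
+code from the kernel tree: the service ID, address, port and operation ID
+are placeholders, an "rxrpc" key is assumed to already be on the keyring
+as in step (3), and the sockaddr_rxrpc layout follows linux/rxrpc.h (which
+nests the transport address as a union member) rather than the shorthand
+used in the snippets above::
+
+    #include <stdio.h>
+    #include <string.h>
+    #include <sys/socket.h>
+    #include <arpa/inet.h>
+    #include <linux/rxrpc.h>
+
+    #ifndef SOL_RXRPC
+    #define SOL_RXRPC 272               /* from linux/socket.h */
+    #endif
+
+    int main(void)
+    {
+            unsigned long call_id = 1;  /* app-chosen tag for this call */
+            const char *key = "AFS:cambridge.redhat.com";
+            unsigned char request[4] = { 0, 0, 0, 100 }; /* operation ID */
+            char ctrl[CMSG_SPACE(sizeof(call_id))];
+            struct sockaddr_rxrpc srx;
+            struct cmsghdr *cmsg;
+            struct msghdr msg;
+            struct iovec iov;
+            int client;
+
+            client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
+            setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY,
+                       key, strlen(key));
+
+            memset(&srx, 0, sizeof(srx));
+            srx.srx_family = AF_RXRPC;
+            srx.srx_service = 52;       /* placeholder service ID */
+            srx.transport_type = SOCK_DGRAM;
+            srx.transport_len = sizeof(srx.transport.sin);
+            srx.transport.sin.sin_family = AF_INET;
+            srx.transport.sin.sin_port = htons(7005);
+            inet_pton(AF_INET, "192.0.2.1", &srx.transport.sin.sin_addr);
+            connect(client, (struct sockaddr *)&srx, sizeof(srx));
+
+            /* attach RXRPC_USER_CALL_ID so the kernel knows the tag */
+            iov.iov_base = request;
+            iov.iov_len = sizeof(request);
+            memset(&msg, 0, sizeof(msg));
+            msg.msg_iov = &iov;
+            msg.msg_iovlen = 1;
+            msg.msg_control = ctrl;
+            msg.msg_controllen = sizeof(ctrl);
+            cmsg = CMSG_FIRSTHDR(&msg);
+            cmsg->cmsg_level = SOL_RXRPC;
+            cmsg->cmsg_type = RXRPC_USER_CALL_ID;
+            cmsg->cmsg_len = CMSG_LEN(sizeof(call_id));
+            memcpy(CMSG_DATA(cmsg), &call_id, sizeof(call_id));
+
+            /* last (and only) part of the request, so no MSG_MORE */
+            if (sendmsg(client, &msg, 0) < 0)
+                    perror("sendmsg");
+            return 0;
+    }
+
+The reply would then be collected with recvmsg(), matching the call by the
+RXRPC_USER_CALL_ID control message as described in step (6).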
+
+A client may ask for a service ID it knows and request that this be upgraded
+to a better service if one is available by supplying RXRPC_UPGRADE_SERVICE on
+the first sendmsg() of a call.  The client should then check srx_service in
+the msg_name filled in by recvmsg() when collecting the result.  srx_service
+will hold the same value as given to sendmsg() if the upgrade request was
+ignored by the service - otherwise it will be altered to indicate the service
+ID the server upgraded to.  Note that the upgraded service ID is chosen by the
+server.  The caller has to wait until it sees the service ID in the reply
+before sending any more calls (further calls to the same destination will be
+blocked until the probe is concluded).
+
+
+====================
+EXAMPLE SERVER USAGE
+====================
+
+A server would be set up to accept operations in the following manner:
+
+ (1) An RxRPC socket is created by::
+
+        server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
+
+     Where the third parameter indicates the address type of the transport
+     socket used - usually IPv4.
+
+ (2) Security is set up if desired by giving the socket a keyring with server
+     secret keys in it::
+
+        keyring = add_key("keyring", "AFSkeys", NULL, 0,
+                          KEY_SPEC_PROCESS_KEYRING);
+
+        const char secret_key[8] = {
+                0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 };
+        add_key("rxrpc_s", "52:2", secret_key, 8, keyring);
+
+        setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7);
+
+     The keyring can be manipulated after it has been given to the socket. This
+     permits the server to add more keys, replace keys, etc. while it is live.
+
+ (3) A local address must then be bound::
+
+        struct sockaddr_rxrpc srx = {
+                .srx_family             = AF_RXRPC,
+                .srx_service            = VL_SERVICE_ID, /* RxRPC service ID */
+                .transport_type         = SOCK_DGRAM,   /* type of transport socket */
+                .transport.sin_family   = AF_INET,
+                .transport.sin_port     = htons(7000), /* AFS callback */
+                .transport.sin_address  = 0,  /* all local interfaces */
+        };
+        bind(server, &srx, sizeof(srx));
+
+     More than one service ID may be bound to a socket, provided the transport
+     parameters are the same.  The limit is currently two.  To do this, bind()
+     should be called twice.
+
+ (4) If service upgrading is required, two service IDs must first have been
+     bound and then the following option must be set::
+
+        unsigned short service_ids[2] = { from_ID, to_ID };
+        setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE,
+                   service_ids, sizeof(service_ids));
+
+     This will automatically upgrade connections on service from_ID to service
+     to_ID if they request it.  This will be reflected in msg_name obtained
+     through recvmsg() when the request data is delivered to userspace.
+
+ (5) The server is then set to listen out for incoming calls::
+
+        listen(server, 100);
+
+ (6) The kernel notifies the server of pending incoming connections by sending
+     it a message for each.  This is received with recvmsg() on the server
+     socket.  It has no data, and has a single dataless control message
+     attached::
+
+        RXRPC_NEW_CALL
+
+     The address that can be passed back by recvmsg() at this point should be
+     ignored since the call for which the message was posted may have gone by
+     the time it is accepted - in which case the first call still on the queue
+     will be accepted.  A minimal wait-and-accept loop is sketched below.
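+
+     One possible shape for that wait, paired with the accept described in
+     step (7) below, is the following sketch.  It assumes the same headers as
+     the client example earlier (<sys/socket.h>, <string.h>, <linux/rxrpc.h>
+     and the SOL_RXRPC fallback), the bound, listening server socket from the
+     steps above and an application-chosen call_id tag, and it elides error
+     handling::
+
+        static void wait_and_accept_one(int server, unsigned long call_id)
+        {
+                char ctrl[CMSG_SPACE(sizeof(call_id))];
+                char actrl[CMSG_SPACE(0) + CMSG_SPACE(sizeof(call_id))];
+                struct msghdr msg = {
+                        .msg_control    = ctrl,
+                        .msg_controllen = sizeof(ctrl),
+                };
+                struct cmsghdr *cm;
+
+                /* Step (6): a dataless message carrying RXRPC_NEW_CALL
+                 * announces each pending call.
+                 */
+                recvmsg(server, &msg, 0);
+                for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+                        if (cm->cmsg_level != SOL_RXRPC ||
+                            cm->cmsg_type != RXRPC_NEW_CALL)
+                                continue;
+
+                        /* Step (7): accept whatever call is at the head of
+                         * the queue with RXRPC_ACCEPT plus
+                         * RXRPC_USER_CALL_ID in one dataless sendmsg().
+                         */
+                        struct msghdr amsg = {
+                                .msg_control    = actrl,
+                                .msg_controllen = sizeof(actrl),
+                        };
+                        struct cmsghdr *ac = CMSG_FIRSTHDR(&amsg);
+
+                        ac->cmsg_level = SOL_RXRPC;
+                        ac->cmsg_type = RXRPC_ACCEPT;
+                        ac->cmsg_len = CMSG_LEN(0);     /* no data */
+                        ac = CMSG_NXTHDR(&amsg, ac);
+                        ac->cmsg_level = SOL_RXRPC;
+                        ac->cmsg_type = RXRPC_USER_CALL_ID;
+                        ac->cmsg_len = CMSG_LEN(sizeof(call_id));
+                        memcpy(CMSG_DATA(ac), &call_id, sizeof(call_id));
+                        sendmsg(server, &amsg, 0);
+                }
+        }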
+
+ (7) The server then accepts the new call by issuing a sendmsg() with two
+     pieces of control data and no actual data:
+
+        ==================  ==============================
+        RXRPC_ACCEPT        indicate connection acceptance
+        RXRPC_USER_CALL_ID  specify user ID for this call
+        ==================  ==============================
+
+ (8) The first request data packet will then be posted to the server socket for
+     recvmsg() to pick up.  At that point, the RxRPC address for the call can
+     be read from the address fields in the msghdr struct.
+
+     Subsequent request data will be posted to the server socket for recvmsg()
+     to collect as it arrives.  All but the last piece of the request data will
+     be delivered with MSG_MORE flagged.
+
+     All data will be delivered with the following control message attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        ==================  ===================================
+
+ (9) The reply data should then be posted to the server socket using a series
+     of sendmsg() calls, each with the following control messages attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        ==================  ===================================
+
+     MSG_MORE should be set in msghdr::msg_flags on all but the last message
+     for a particular call.
+
+(10) The final ACK from the client will be posted for retrieval by recvmsg()
+     when it is received.  It will take the form of a dataless message with two
+     control messages attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        RXRPC_ACK           indicates final ACK (no data)
+        ==================  ===================================
+
+     MSG_EOR will be flagged to indicate that this is the final message for
+     this call.
+
+(11) Up to the point the final packet of reply data is sent, the call can be
+     aborted by calling sendmsg() with a dataless message with the following
+     control messages attached:
+
+        ==================  ===================================
+        RXRPC_USER_CALL_ID  specifies the user ID for this call
+        RXRPC_ABORT         indicates abort code (4 byte data)
+        ==================  ===================================
+
+     Any packets waiting in the socket's receive queue will be discarded if
+     this is issued.
+
+Note that all the communications for a particular service take place through
+the one server socket, using control messages on sendmsg() and recvmsg() to
+determine the call affected.
+
+
+=========================
+AF_RXRPC KERNEL INTERFACE
+=========================
+
+The AF_RXRPC module also provides an interface for use by in-kernel utilities
+such as the AFS filesystem.  This permits such a utility to:
+
+ (1) Use different keys directly on individual client calls on one socket
+     rather than having to open a whole slew of sockets, one for each key it
+     might want to use.
+
+ (2) Avoid having RxRPC call request_key() at the point of issue of a call or
+     opening of a socket.  Instead the utility is responsible for requesting a
+     key at the appropriate point.  AFS, for instance, would do this during VFS
+     operations such as open() or unlink().  The key is then handed through
+     when the call is initiated.
+
+ (3) Request the use of something other than GFP_KERNEL to allocate memory.
+
+ (4) Avoid the overhead of using the recvmsg() call.  RxRPC messages can be
+     intercepted before they get put into the socket Rx queue and the socket
+     buffers manipulated directly.
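+
+In outline, a kernel-side client strings the functions described below
+together roughly as follows.  This is only a sketch: sock, srx, key,
+user_call_ID, notify_rx, debug_id, msg and the request/reply buffers are
+assumed to have been set up by the caller, and the retry on -EAGAIN would
+normally sleep until notify_rx reports progress rather than spin::
+
+        struct rxrpc_call *call;
+        size_t offset = 0;
+        u32 abort_code;
+        u16 service;
+        int ret;
+
+        call = rxrpc_kernel_begin_call(sock, &srx, key, user_call_ID,
+                                       request_len, GFP_KERNEL, notify_rx,
+                                       false, true, debug_id);
+        if (IS_ERR(call))               /* ERR_PTR() is returned on failure */
+                return PTR_ERR(call);
+
+        /* msg describes the request data; MSG_MORE is not set as this is
+         * the only send for the call.
+         */
+        ret = rxrpc_kernel_send_data(sock, call, &msg, request_len, NULL);
+
+        while (ret == 0 || ret == -EAGAIN)
+                /* want_more is false: the whole reply is wanted in one go */
+                ret = rxrpc_kernel_recv_data(sock, call, reply, reply_len,
+                                             &offset, false, &abort_code,
+                                             &service);
+
+        rxrpc_kernel_end_call(sock, call);
+        return ret == 1 ? 0 : ret;      /* 1 means the reply completed */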
+ +To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, +bind an address as appropriate and listen if it's to be a server socket, but +then it passes this to the kernel interface functions. + +The kernel interface functions are as follows: + + (#) Begin a new client call:: + + struct rxrpc_call * + rxrpc_kernel_begin_call(struct socket *sock, + struct sockaddr_rxrpc *srx, + struct key *key, + unsigned long user_call_ID, + s64 tx_total_len, + gfp_t gfp, + rxrpc_notify_rx_t notify_rx, + bool upgrade, + bool intr, + unsigned int debug_id); + + This allocates the infrastructure to make a new RxRPC call and assigns + call and connection numbers. The call will be made on the UDP port that + the socket is bound to. The call will go to the destination address of a + connected client socket unless an alternative is supplied (srx is + non-NULL). + + If a key is supplied then this will be used to secure the call instead of + the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls + secured in this way will still share connections if at all possible. + + The user_call_ID is equivalent to that supplied to sendmsg() in the + control data buffer. It is entirely feasible to use this to point to a + kernel data structure. + + tx_total_len is the amount of data the caller is intending to transmit + with this call (or -1 if unknown at this point). Setting the data size + allows the kernel to encrypt directly to the packet buffers, thereby + saving a copy. The value may not be less than -1. + + notify_rx is a pointer to a function to be called when events such as + incoming data packets or remote aborts happen. + + upgrade should be set to true if a client operation should request that + the server upgrade the service to a better one. The resultant service ID + is returned by rxrpc_kernel_recv_data(). + + intr should be set to true if the call should be interruptible. If this + is not set, this function may not return until a channel has been + allocated; if it is set, the function may return -ERESTARTSYS. + + debug_id is the call debugging ID to be used for tracing. This can be + obtained by atomically incrementing rxrpc_debug_id. + + If this function is successful, an opaque reference to the RxRPC call is + returned. The caller now holds a reference on this and it must be + properly ended. + + (#) End a client call:: + + void rxrpc_kernel_end_call(struct socket *sock, + struct rxrpc_call *call); + + This is used to end a previously begun call. The user_call_ID is expunged + from AF_RXRPC's knowledge and will not be seen again in association with + the specified call. + + (#) Send data through a call:: + + typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, + unsigned long user_call_ID, + struct sk_buff *skb); + + int rxrpc_kernel_send_data(struct socket *sock, + struct rxrpc_call *call, + struct msghdr *msg, + size_t len, + rxrpc_notify_end_tx_t notify_end_rx); + + This is used to supply either the request part of a client call or the + reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the + data buffers to be used. msg_iov may not be NULL and must point + exclusively to in-kernel virtual addresses. msg.msg_flags may be given + MSG_MORE if there will be subsequent data sends for this call. + + The msg must not specify a destination address, control data or any flags + other than MSG_MORE. len is the total amount of data to transmit. 
+
+     notify_end_rx can be NULL or it can be used to specify a function to be
+     called when the call changes state to end the Tx phase.  This function is
+     called with the call-state spinlock held to prevent any reply or final ACK
+     from being delivered first.
+
+ (#) Receive data from a call::
+
+        int rxrpc_kernel_recv_data(struct socket *sock,
+                                   struct rxrpc_call *call,
+                                   void *buf,
+                                   size_t size,
+                                   size_t *_offset,
+                                   bool want_more,
+                                   u32 *_abort,
+                                   u16 *_service)
+
+     This is used to receive data from either the reply part of a client call
+     or the request part of a service call.  buf and size specify how much
+     data is desired and where to store it.  ``*_offset`` is added on to buf
+     and subtracted from size internally; the amount copied into the buffer is
+     added to ``*_offset`` before returning.
+
+     want_more should be true if further data will be required after this is
+     satisfied and false if this is the last item of the receive phase.
+
+     There are three normal returns: 0 if the buffer was filled and want_more
+     was true; 1 if the buffer was filled, the last DATA packet has been
+     emptied and want_more was false; and -EAGAIN if the function needs to be
+     called again.
+
+     If the last DATA packet is processed but the buffer contains less than
+     the amount requested, EBADMSG is returned.  If want_more wasn't set, but
+     more data was available, EMSGSIZE is returned.
+
+     If a remote ABORT is detected, the abort code received will be stored in
+     ``*_abort`` and ECONNABORTED will be returned.
+
+     The service ID that the call ended up with is returned into ``*_service``.
+     This can be used to see if a call got a service upgrade.
+
+ (#) Abort a call::
+
+        void rxrpc_kernel_abort_call(struct socket *sock,
+                                     struct rxrpc_call *call,
+                                     u32 abort_code);
+
+     This is used to abort a call if it's still in an abortable state.  The
+     abort code specified will be placed in the ABORT message sent.
+
+ (#) Intercept received RxRPC messages::
+
+        typedef void (*rxrpc_interceptor_t)(struct sock *sk,
+                                            unsigned long user_call_ID,
+                                            struct sk_buff *skb);
+
+        void
+        rxrpc_kernel_intercept_rx_messages(struct socket *sock,
+                                           rxrpc_interceptor_t interceptor);
+
+     This installs an interceptor function on the specified AF_RXRPC socket.
+     All messages that would otherwise wind up in the socket's Rx queue are
+     then diverted to this function.  Note that care must be taken to process
+     the messages in the right order to maintain DATA message sequentiality.
+
+     The interceptor function itself is provided with the address of the socket
+     that is handling the incoming message, the ID assigned by the kernel
+     utility to the call and the socket buffer containing the message.
+
+     The skb->mark field indicates the type of message:
+
+        ===============================  =======================================
+        Mark                             Meaning
+        ===============================  =======================================
+        RXRPC_SKB_MARK_DATA              Data message
+        RXRPC_SKB_MARK_FINAL_ACK         Final ACK received for an incoming call
+        RXRPC_SKB_MARK_BUSY              Client call rejected as server busy
+        RXRPC_SKB_MARK_REMOTE_ABORT      Call aborted by peer
+        RXRPC_SKB_MARK_NET_ERROR         Network error detected
+        RXRPC_SKB_MARK_LOCAL_ERROR       Local error encountered
+        RXRPC_SKB_MARK_NEW_CALL          New incoming call awaiting acceptance
+        ===============================  =======================================
+
+     The remote abort message can be probed with rxrpc_kernel_get_abort_code().
+     The two error messages can be probed with rxrpc_kernel_get_error_number().
+     A new call can be accepted with rxrpc_kernel_accept_call().
+
+     Data messages can have their contents extracted with the usual bunch of
+     socket buffer manipulation functions.  A data message can be determined to
+     be the last one in a sequence with rxrpc_kernel_is_data_last().  When a
+     data message has been used up, rxrpc_kernel_data_consumed() should be
+     called on it.
+
+     Messages should be handed to rxrpc_kernel_free_skb() to dispose of them.
+     It is possible to get extra refs on all types of message for later
+     freeing, but this may pin the state of a call until the message is
+     finally freed.
+
+ (#) Accept an incoming call::
+
+        struct rxrpc_call *
+        rxrpc_kernel_accept_call(struct socket *sock,
+                                 unsigned long user_call_ID);
+
+     This is used to accept an incoming call and to assign it a call ID.  This
+     function is similar to rxrpc_kernel_begin_call() and calls accepted must
+     be ended in the same way.
+
+     If this function is successful, an opaque reference to the RxRPC call is
+     returned.  The caller now holds a reference on this and it must be
+     properly ended.
+
+ (#) Reject an incoming call::
+
+        int rxrpc_kernel_reject_call(struct socket *sock);
+
+     This is used to reject the first incoming call on the socket's queue with
+     a BUSY message.  -ENODATA is returned if there were no incoming calls.
+     Other errors may be returned if the call had been aborted (-ECONNABORTED)
+     or had timed out (-ETIME).
+
+ (#) Allocate a null key for doing anonymous security::
+
+        struct key *rxrpc_get_null_key(const char *keyname);
+
+     This is used to allocate a null RxRPC key that can be used to indicate
+     anonymous security for a particular domain.
+
+ (#) Get the peer address of a call::
+
+        void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
+                                   struct sockaddr_rxrpc *_srx);
+
+     This is used to find the remote peer address of a call.
+
+ (#) Set the total transmit data size on a call::
+
+        void rxrpc_kernel_set_tx_length(struct socket *sock,
+                                        struct rxrpc_call *call,
+                                        s64 tx_total_len);
+
+     This sets the amount of data that the caller is intending to transmit on a
+     call.  It's intended to be used for setting the reply size as the request
+     size should be set when the call is begun.  tx_total_len may not be less
+     than zero.
+
+ (#) Get call RTT::
+
+        u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call);
+
+     Get the RTT to the peer in use by a call.  The value returned is in
+     nanoseconds.
+
+ (#) Check call still alive::
+
+        bool rxrpc_kernel_check_life(struct socket *sock,
+                                     struct rxrpc_call *call,
+                                     u32 *_life);
+        void rxrpc_kernel_probe_life(struct socket *sock,
+                                     struct rxrpc_call *call);
+
+     The first function passes back in ``*_life`` a number that is updated when
+     ACKs are received from the peer (notably including PING RESPONSE ACKs
+     which we can elicit by sending PING ACKs to see if the call still exists
+     on the server).  The caller should compare the numbers of two calls to see
+     if the call is still alive after waiting for a suitable interval.  It also
+     returns true as long as the call hasn't yet reached the completed state.
+
+     This allows the caller to work out if the server is still contactable and
+     if the call is still alive on the server while waiting for the server to
+     process a client operation.
+
+     The second function causes a ping ACK to be transmitted to try to provoke
+     the peer into responding, which would then cause the value returned by the
+     first function to change.  Note that this must be called in TASK_RUNNING
+     state.  A polling loop built on this pair is sketched below.
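+
+     The sketch assumes an established call on sock; the one-second interval
+     and the give_up condition are placeholders, and a real caller would
+     typically also be waiting for notify_rx to report completion::
+
+        u32 life, last_life;
+        bool alive = rxrpc_kernel_check_life(sock, call, &last_life);
+
+        while (alive && !give_up) {
+                schedule_timeout_interruptible(HZ);     /* suitable interval */
+                alive = rxrpc_kernel_check_life(sock, call, &life);
+                if (life == last_life)
+                        /* No ACKs seen: prod the peer for a ping response */
+                        rxrpc_kernel_probe_life(sock, call);
+                last_life = life;
+        }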
+
+ (#) Get reply timestamp::
+
+        bool rxrpc_kernel_get_reply_time(struct socket *sock,
+                                         struct rxrpc_call *call,
+                                         ktime_t *_ts)
+
+     This allows the timestamp on the first DATA packet of the reply of a
+     client call to be queried, provided that it is still in the Rx ring.  If
+     successful, the timestamp will be stored into ``*_ts`` and true will be
+     returned; false will be returned otherwise.
+
+ (#) Get remote client epoch::
+
+        u32 rxrpc_kernel_get_epoch(struct socket *sock,
+                                   struct rxrpc_call *call)
+
+     This allows the epoch that's contained in packets of an incoming client
+     call to be queried.  This value is returned.  The function is always
+     successful if the call is still in progress.  It shouldn't be called once
+     the call has expired.  Note that calling this on a local client call only
+     returns the local epoch.
+
+     This value can be used to determine if the remote client has been
+     restarted as it shouldn't change otherwise.
+
+ (#) Set the maximum lifespan on a call::
+
+        void rxrpc_kernel_set_max_life(struct socket *sock,
+                                       struct rxrpc_call *call,
+                                       unsigned long hard_timeout)
+
+     This sets the maximum lifespan on a call to hard_timeout (which is in
+     jiffies).  In the event of the timeout occurring, the call will be
+     aborted and -ETIME or -ETIMEDOUT will be returned.
+
+
+=======================
+CONFIGURABLE PARAMETERS
+=======================
+
+The RxRPC protocol driver has a number of configurable parameters that can be
+adjusted through sysctls in /proc/sys/net/rxrpc/:
+
+ (#) req_ack_delay
+
+     The amount of time in milliseconds after receiving a packet with the
+     request-ack flag set before we honour the flag and actually send the
+     requested ack.
+
+     Usually the other side won't stop sending packets until the advertised
+     reception window is full (to a maximum of 255 packets), so delaying the
+     ACK permits several packets to be ACK'd in one go.
+
+ (#) soft_ack_delay
+
+     The amount of time in milliseconds after receiving a new packet before we
+     generate a soft-ACK to tell the sender that it doesn't need to resend.
+
+ (#) idle_ack_delay
+
+     The amount of time in milliseconds after all the packets currently in the
+     received queue have been consumed before we generate a hard-ACK to tell
+     the sender it can free its buffers, assuming no other reason occurs that
+     we would send an ACK.
+
+ (#) resend_timeout
+
+     The amount of time in milliseconds after transmitting a packet before we
+     transmit it again, assuming no ACK is received from the receiver telling
+     us they got it.
+
+ (#) max_call_lifetime
+
+     The maximum amount of time in seconds that a call may be in progress
+     before we preemptively kill it.
+
+ (#) dead_call_expiry
+
+     The amount of time in seconds before we remove a dead call from the call
+     list.  Dead calls are kept around for a little while for the purpose of
+     repeating ACK and ABORT packets.
+
+ (#) connection_expiry
+
+     The amount of time in seconds after a connection was last used before we
+     remove it from the connection list.  While a connection is in existence,
+     it serves as a placeholder for negotiated security; when it is deleted,
+     the security must be renegotiated.
+
+ (#) transport_expiry
+
+     The amount of time in seconds after a transport was last used before we
+     remove it from the transport list.  While a transport is in existence, it
+     serves to anchor the peer data and keeps the connection ID counter.
+
+ (#) rxrpc_rx_window_size
+
+     The size of the receive window in packets.
This is the maximum number of + unconsumed received packets we're willing to hold in memory for any + particular call. + + (#) rxrpc_rx_mtu + + The maximum packet MTU size that we're willing to receive in bytes. This + indicates to the peer whether we're willing to accept jumbo packets. + + (#) rxrpc_rx_jumbo_max + + The maximum number of packets that we're willing to accept in a jumbo + packet. Non-terminal packets in a jumbo packet must contain a four byte + header plus exactly 1412 bytes of data. The terminal packet must contain + a four byte header plus any amount of data. In any event, a jumbo packet + may not exceed rxrpc_rx_mtu in size. diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt deleted file mode 100644 index 180e07d956a7..000000000000 --- a/Documentation/networking/rxrpc.txt +++ /dev/null @@ -1,1155 +0,0 @@ - ====================== - RxRPC NETWORK PROTOCOL - ====================== - -The RxRPC protocol driver provides a reliable two-phase transport on top of UDP -that can be used to perform RxRPC remote operations. This is done over sockets -of AF_RXRPC family, using sendmsg() and recvmsg() with control data to send and -receive data, aborts and errors. - -Contents of this document: - - (*) Overview. - - (*) RxRPC protocol summary. - - (*) AF_RXRPC driver model. - - (*) Control messages. - - (*) Socket options. - - (*) Security. - - (*) Example client usage. - - (*) Example server usage. - - (*) AF_RXRPC kernel interface. - - (*) Configurable parameters. - - -======== -OVERVIEW -======== - -RxRPC is a two-layer protocol. There is a session layer which provides -reliable virtual connections using UDP over IPv4 (or IPv6) as the transport -layer, but implements a real network protocol; and there's the presentation -layer which renders structured data to binary blobs and back again using XDR -(as does SunRPC): - - +-------------+ - | Application | - +-------------+ - | XDR | Presentation - +-------------+ - | RxRPC | Session - +-------------+ - | UDP | Transport - +-------------+ - - -AF_RXRPC provides: - - (1) Part of an RxRPC facility for both kernel and userspace applications by - making the session part of it a Linux network protocol (AF_RXRPC). - - (2) A two-phase protocol. The client transmits a blob (the request) and then - receives a blob (the reply), and the server receives the request and then - transmits the reply. - - (3) Retention of the reusable bits of the transport system set up for one call - to speed up subsequent calls. - - (4) A secure protocol, using the Linux kernel's key retention facility to - manage security on the client end. The server end must of necessity be - more active in security negotiations. - -AF_RXRPC does not provide XDR marshalling/presentation facilities. That is -left to the application. AF_RXRPC only deals in blobs. Even the operation ID -is just the first four bytes of the request blob, and as such is beyond the -kernel's interest. - - -Sockets of AF_RXRPC family are: - - (1) created as type SOCK_DGRAM; - - (2) provided with a protocol of the type of underlying transport they're going - to use - currently only PF_INET is supported. - - -The Andrew File System (AFS) is an example of an application that uses this and -that has both kernel (filesystem) and userspace (utility) components. 
- - -====================== -RXRPC PROTOCOL SUMMARY -====================== - -An overview of the RxRPC protocol: - - (*) RxRPC sits on top of another networking protocol (UDP is the only option - currently), and uses this to provide network transport. UDP ports, for - example, provide transport endpoints. - - (*) RxRPC supports multiple virtual "connections" from any given transport - endpoint, thus allowing the endpoints to be shared, even to the same - remote endpoint. - - (*) Each connection goes to a particular "service". A connection may not go - to multiple services. A service may be considered the RxRPC equivalent of - a port number. AF_RXRPC permits multiple services to share an endpoint. - - (*) Client-originating packets are marked, thus a transport endpoint can be - shared between client and server connections (connections have a - direction). - - (*) Up to a billion connections may be supported concurrently between one - local transport endpoint and one service on one remote endpoint. An RxRPC - connection is described by seven numbers: - - Local address } - Local port } Transport (UDP) address - Remote address } - Remote port } - Direction - Connection ID - Service ID - - (*) Each RxRPC operation is a "call". A connection may make up to four - billion calls, but only up to four calls may be in progress on a - connection at any one time. - - (*) Calls are two-phase and asymmetric: the client sends its request data, - which the service receives; then the service transmits the reply data - which the client receives. - - (*) The data blobs are of indefinite size, the end of a phase is marked with a - flag in the packet. The number of packets of data making up one blob may - not exceed 4 billion, however, as this would cause the sequence number to - wrap. - - (*) The first four bytes of the request data are the service operation ID. - - (*) Security is negotiated on a per-connection basis. The connection is - initiated by the first data packet on it arriving. If security is - requested, the server then issues a "challenge" and then the client - replies with a "response". If the response is successful, the security is - set for the lifetime of that connection, and all subsequent calls made - upon it use that same security. In the event that the server lets a - connection lapse before the client, the security will be renegotiated if - the client uses the connection again. - - (*) Calls use ACK packets to handle reliability. Data packets are also - explicitly sequenced per call. - - (*) There are two types of positive acknowledgment: hard-ACKs and soft-ACKs. - A hard-ACK indicates to the far side that all the data received to a point - has been received and processed; a soft-ACK indicates that the data has - been received but may yet be discarded and re-requested. The sender may - not discard any transmittable packets until they've been hard-ACK'd. - - (*) Reception of a reply data packet implicitly hard-ACK's all the data - packets that make up the request. - - (*) An call is complete when the request has been sent, the reply has been - received and the final hard-ACK on the last packet of the reply has - reached the server. - - (*) An call may be aborted by either end at any time up to its completion. - - -===================== -AF_RXRPC DRIVER MODEL -===================== - -About the AF_RXRPC driver: - - (*) The AF_RXRPC protocol transparently uses internal sockets of the transport - protocol to represent transport endpoints. 
- - (*) AF_RXRPC sockets map onto RxRPC connection bundles. Actual RxRPC - connections are handled transparently. One client socket may be used to - make multiple simultaneous calls to the same service. One server socket - may handle calls from many clients. - - (*) Additional parallel client connections will be initiated to support extra - concurrent calls, up to a tunable limit. - - (*) Each connection is retained for a certain amount of time [tunable] after - the last call currently using it has completed in case a new call is made - that could reuse it. - - (*) Each internal UDP socket is retained [tunable] for a certain amount of - time [tunable] after the last connection using it discarded, in case a new - connection is made that could use it. - - (*) A client-side connection is only shared between calls if they have have - the same key struct describing their security (and assuming the calls - would otherwise share the connection). Non-secured calls would also be - able to share connections with each other. - - (*) A server-side connection is shared if the client says it is. - - (*) ACK'ing is handled by the protocol driver automatically, including ping - replying. - - (*) SO_KEEPALIVE automatically pings the other side to keep the connection - alive [TODO]. - - (*) If an ICMP error is received, all calls affected by that error will be - aborted with an appropriate network error passed through recvmsg(). - - -Interaction with the user of the RxRPC socket: - - (*) A socket is made into a server socket by binding an address with a - non-zero service ID. - - (*) In the client, sending a request is achieved with one or more sendmsgs, - followed by the reply being received with one or more recvmsgs. - - (*) The first sendmsg for a request to be sent from a client contains a tag to - be used in all other sendmsgs or recvmsgs associated with that call. The - tag is carried in the control data. - - (*) connect() is used to supply a default destination address for a client - socket. This may be overridden by supplying an alternate address to the - first sendmsg() of a call (struct msghdr::msg_name). - - (*) If connect() is called on an unbound client, a random local port will - bound before the operation takes place. - - (*) A server socket may also be used to make client calls. To do this, the - first sendmsg() of the call must specify the target address. The server's - transport endpoint is used to send the packets. - - (*) Once the application has received the last message associated with a call, - the tag is guaranteed not to be seen again, and so it can be used to pin - client resources. A new call can then be initiated with the same tag - without fear of interference. - - (*) In the server, a request is received with one or more recvmsgs, then the - the reply is transmitted with one or more sendmsgs, and then the final ACK - is received with a last recvmsg. - - (*) When sending data for a call, sendmsg is given MSG_MORE if there's more - data to come on that call. - - (*) When receiving data for a call, recvmsg flags MSG_MORE if there's more - data to come for that call. - - (*) When receiving data or messages for a call, MSG_EOR is flagged by recvmsg - to indicate the terminal message for that call. - - (*) A call may be aborted by adding an abort control message to the control - data. Issuing an abort terminates the kernel's use of that call's tag. - Any messages waiting in the receive queue for that call will be discarded. 
- - (*) Aborts, busy notifications and challenge packets are delivered by recvmsg, - and control data messages will be set to indicate the context. Receiving - an abort or a busy message terminates the kernel's use of that call's tag. - - (*) The control data part of the msghdr struct is used for a number of things: - - (*) The tag of the intended or affected call. - - (*) Sending or receiving errors, aborts and busy notifications. - - (*) Notifications of incoming calls. - - (*) Sending debug requests and receiving debug replies [TODO]. - - (*) When the kernel has received and set up an incoming call, it sends a - message to server application to let it know there's a new call awaiting - its acceptance [recvmsg reports a special control message]. The server - application then uses sendmsg to assign a tag to the new call. Once that - is done, the first part of the request data will be delivered by recvmsg. - - (*) The server application has to provide the server socket with a keyring of - secret keys corresponding to the security types it permits. When a secure - connection is being set up, the kernel looks up the appropriate secret key - in the keyring and then sends a challenge packet to the client and - receives a response packet. The kernel then checks the authorisation of - the packet and either aborts the connection or sets up the security. - - (*) The name of the key a client will use to secure its communications is - nominated by a socket option. - - -Notes on sendmsg: - - (*) MSG_WAITALL can be set to tell sendmsg to ignore signals if the peer is - making progress at accepting packets within a reasonable time such that we - manage to queue up all the data for transmission. This requires the - client to accept at least one packet per 2*RTT time period. - - If this isn't set, sendmsg() will return immediately, either returning - EINTR/ERESTARTSYS if nothing was consumed or returning the amount of data - consumed. - - -Notes on recvmsg: - - (*) If there's a sequence of data messages belonging to a particular call on - the receive queue, then recvmsg will keep working through them until: - - (a) it meets the end of that call's received data, - - (b) it meets a non-data message, - - (c) it meets a message belonging to a different call, or - - (d) it fills the user buffer. - - If recvmsg is called in blocking mode, it will keep sleeping, awaiting the - reception of further data, until one of the above four conditions is met. - - (2) MSG_PEEK operates similarly, but will return immediately if it has put any - data in the buffer rather than sleeping until it can fill the buffer. - - (3) If a data message is only partially consumed in filling a user buffer, - then the remainder of that message will be left on the front of the queue - for the next taker. MSG_TRUNC will never be flagged. - - (4) If there is more data to be had on a call (it hasn't copied the last byte - of the last data message in that phase yet), then MSG_MORE will be - flagged. - - -================ -CONTROL MESSAGES -================ - -AF_RXRPC makes use of control messages in sendmsg() and recvmsg() to multiplex -calls, to invoke certain actions and to report certain conditions. 
These are: - - MESSAGE ID SRT DATA MEANING - ======================= === =========== =============================== - RXRPC_USER_CALL_ID sr- User ID App's call specifier - RXRPC_ABORT srt Abort code Abort code to issue/received - RXRPC_ACK -rt n/a Final ACK received - RXRPC_NET_ERROR -rt error num Network error on call - RXRPC_BUSY -rt n/a Call rejected (server busy) - RXRPC_LOCAL_ERROR -rt error num Local error encountered - RXRPC_NEW_CALL -r- n/a New call received - RXRPC_ACCEPT s-- n/a Accept new call - RXRPC_EXCLUSIVE_CALL s-- n/a Make an exclusive client call - RXRPC_UPGRADE_SERVICE s-- n/a Client call can be upgraded - RXRPC_TX_LENGTH s-- data len Total length of Tx data - - (SRT = usable in Sendmsg / delivered by Recvmsg / Terminal message) - - (*) RXRPC_USER_CALL_ID - - This is used to indicate the application's call ID. It's an unsigned long - that the app specifies in the client by attaching it to the first data - message or in the server by passing it in association with an RXRPC_ACCEPT - message. recvmsg() passes it in conjunction with all messages except - those of the RXRPC_NEW_CALL message. - - (*) RXRPC_ABORT - - This is can be used by an application to abort a call by passing it to - sendmsg, or it can be delivered by recvmsg to indicate a remote abort was - received. Either way, it must be associated with an RXRPC_USER_CALL_ID to - specify the call affected. If an abort is being sent, then error EBADSLT - will be returned if there is no call with that user ID. - - (*) RXRPC_ACK - - This is delivered to a server application to indicate that the final ACK - of a call was received from the client. It will be associated with an - RXRPC_USER_CALL_ID to indicate the call that's now complete. - - (*) RXRPC_NET_ERROR - - This is delivered to an application to indicate that an ICMP error message - was encountered in the process of trying to talk to the peer. An - errno-class integer value will be included in the control message data - indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call - affected. - - (*) RXRPC_BUSY - - This is delivered to a client application to indicate that a call was - rejected by the server due to the server being busy. It will be - associated with an RXRPC_USER_CALL_ID to indicate the rejected call. - - (*) RXRPC_LOCAL_ERROR - - This is delivered to an application to indicate that a local error was - encountered and that a call has been aborted because of it. An - errno-class integer value will be included in the control message data - indicating the problem, and an RXRPC_USER_CALL_ID will indicate the call - affected. - - (*) RXRPC_NEW_CALL - - This is delivered to indicate to a server application that a new call has - arrived and is awaiting acceptance. No user ID is associated with this, - as a user ID must subsequently be assigned by doing an RXRPC_ACCEPT. - - (*) RXRPC_ACCEPT - - This is used by a server application to attempt to accept a call and - assign it a user ID. It should be associated with an RXRPC_USER_CALL_ID - to indicate the user ID to be assigned. If there is no call to be - accepted (it may have timed out, been aborted, etc.), then sendmsg will - return error ENODATA. If the user ID is already in use by another call, - then error EBADSLT will be returned. - - (*) RXRPC_EXCLUSIVE_CALL - - This is used to indicate that a client call should be made on a one-off - connection. The connection is discarded once the call has terminated. 
- - (*) RXRPC_UPGRADE_SERVICE - - This is used to make a client call to probe if the specified service ID - may be upgraded by the server. The caller must check msg_name returned to - recvmsg() for the service ID actually in use. The operation probed must - be one that takes the same arguments in both services. - - Once this has been used to establish the upgrade capability (or lack - thereof) of the server, the service ID returned should be used for all - future communication to that server and RXRPC_UPGRADE_SERVICE should no - longer be set. - - (*) RXRPC_TX_LENGTH - - This is used to inform the kernel of the total amount of data that is - going to be transmitted by a call (whether in a client request or a - service response). If given, it allows the kernel to encrypt from the - userspace buffer directly to the packet buffers, rather than copying into - the buffer and then encrypting in place. This may only be given with the - first sendmsg() providing data for a call. EMSGSIZE will be generated if - the amount of data actually given is different. - - This takes a parameter of __s64 type that indicates how much will be - transmitted. This may not be less than zero. - -The symbol RXRPC__SUPPORTED is defined as one more than the highest control -message type supported. At run time this can be queried by means of the -RXRPC_SUPPORTED_CMSG socket option (see below). - - -============== -SOCKET OPTIONS -============== - -AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: - - (*) RXRPC_SECURITY_KEY - - This is used to specify the description of the key to be used. The key is - extracted from the calling process's keyrings with request_key() and - should be of "rxrpc" type. - - The optval pointer points to the description string, and optlen indicates - how long the string is, without the NUL terminator. - - (*) RXRPC_SECURITY_KEYRING - - Similar to above but specifies a keyring of server secret keys to use (key - type "keyring"). See the "Security" section. - - (*) RXRPC_EXCLUSIVE_CONNECTION - - This is used to request that new connections should be used for each call - made subsequently on this socket. optval should be NULL and optlen 0. - - (*) RXRPC_MIN_SECURITY_LEVEL - - This is used to specify the minimum security level required for calls on - this socket. optval must point to an int containing one of the following - values: - - (a) RXRPC_SECURITY_PLAIN - - Encrypted checksum only. - - (b) RXRPC_SECURITY_AUTH - - Encrypted checksum plus packet padded and first eight bytes of packet - encrypted - which includes the actual packet length. - - (c) RXRPC_SECURITY_ENCRYPTED - - Encrypted checksum plus entire packet padded and encrypted, including - actual packet length. - - (*) RXRPC_UPGRADEABLE_SERVICE - - This is used to indicate that a service socket with two bindings may - upgrade one bound service to the other if requested by the client. optval - must point to an array of two unsigned short ints. The first is the - service ID to upgrade from and the second the service ID to upgrade to. - - (*) RXRPC_SUPPORTED_CMSG - - This is a read-only option that writes an int into the buffer indicating - the highest control message type supported. - - -======== -SECURITY -======== - -Currently, only the kerberos 4 equivalent protocol has been implemented -(security index 2 - rxkad). This requires the rxkad module to be loaded and, -on the client, tickets of the appropriate type to be obtained from the AFS -kaserver or the kerberos server and installed as "rxrpc" type keys. 
This is -normally done using the klog program. An example simple klog program can be -found at: - - http://people.redhat.com/~dhowells/rxrpc/klog.c - -The payload provided to add_key() on the client should be of the following -form: - - struct rxrpc_key_sec2_v1 { - uint16_t security_index; /* 2 */ - uint16_t ticket_length; /* length of ticket[] */ - uint32_t expiry; /* time at which expires */ - uint8_t kvno; /* key version number */ - uint8_t __pad[3]; - uint8_t session_key[8]; /* DES session key */ - uint8_t ticket[0]; /* the encrypted ticket */ - }; - -Where the ticket blob is just appended to the above structure. - - -For the server, keys of type "rxrpc_s" must be made available to the server. -They have a description of ":" (eg: "52:2" for an -rxkad key for the AFS VL service). When such a key is created, it should be -given the server's secret key as the instantiation data (see the example -below). - - add_key("rxrpc_s", "52:2", secret_key, 8, keyring); - -A keyring is passed to the server socket by naming it in a sockopt. The server -socket then looks the server secret keys up in this keyring when secure -incoming connections are made. This can be seen in an example program that can -be found at: - - http://people.redhat.com/~dhowells/rxrpc/listen.c - - -==================== -EXAMPLE CLIENT USAGE -==================== - -A client would issue an operation by: - - (1) An RxRPC socket is set up by: - - client = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); - - Where the third parameter indicates the protocol family of the transport - socket used - usually IPv4 but it can also be IPv6 [TODO]. - - (2) A local address can optionally be bound: - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = 0, /* we're a client */ - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7000), /* AFS callback */ - .transport.sin_address = 0, /* all local interfaces */ - }; - bind(client, &srx, sizeof(srx)); - - This specifies the local UDP port to be used. If not given, a random - non-privileged port will be used. A UDP port may be shared between - several unrelated RxRPC sockets. Security is handled on a basis of - per-RxRPC virtual connection. - - (3) The security is set: - - const char *key = "AFS:cambridge.redhat.com"; - setsockopt(client, SOL_RXRPC, RXRPC_SECURITY_KEY, key, strlen(key)); - - This issues a request_key() to get the key representing the security - context. The minimum security level can be set: - - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; - setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - &sec, sizeof(sec)); - - (4) The server to be contacted can then be specified (alternatively this can - be done through sendmsg): - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = VL_SERVICE_ID, - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7005), /* AFS volume manager */ - .transport.sin_address = ..., - }; - connect(client, &srx, sizeof(srx)); - - (5) The request data should then be posted to the server socket using a series - of sendmsg() calls, each with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - MSG_MORE should be set in msghdr::msg_flags on all but the last part of - the request. Multiple requests may be made simultaneously. - - An RXRPC_TX_LENGTH control message can also be specified on the first - sendmsg() call. 
- - If a call is intended to go to a destination other than the default - specified through connect(), then msghdr::msg_name should be set on the - first request message of that call. - - (6) The reply data will then be posted to the server socket for recvmsg() to - pick up. MSG_MORE will be flagged by recvmsg() if there's more reply data - for a particular call to be read. MSG_EOR will be set on the terminal - read for a call. - - All data will be delivered with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - If an abort or error occurred, this will be returned in the control data - buffer instead, and MSG_EOR will be flagged to indicate the end of that - call. - -A client may ask for a service ID it knows and ask that this be upgraded to a -better service if one is available by supplying RXRPC_UPGRADE_SERVICE on the -first sendmsg() of a call. The client should then check srx_service in the -msg_name filled in by recvmsg() when collecting the result. srx_service will -hold the same value as given to sendmsg() if the upgrade request was ignored by -the service - otherwise it will be altered to indicate the service ID the -server upgraded to. Note that the upgraded service ID is chosen by the server. -The caller has to wait until it sees the service ID in the reply before sending -any more calls (further calls to the same destination will be blocked until the -probe is concluded). - - -==================== -EXAMPLE SERVER USAGE -==================== - -A server would be set up to accept operations in the following manner: - - (1) An RxRPC socket is created by: - - server = socket(AF_RXRPC, SOCK_DGRAM, PF_INET); - - Where the third parameter indicates the address type of the transport - socket used - usually IPv4. - - (2) Security is set up if desired by giving the socket a keyring with server - secret keys in it: - - keyring = add_key("keyring", "AFSkeys", NULL, 0, - KEY_SPEC_PROCESS_KEYRING); - - const char secret_key[8] = { - 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 }; - add_key("rxrpc_s", "52:2", secret_key, 8, keyring); - - setsockopt(server, SOL_RXRPC, RXRPC_SECURITY_KEYRING, "AFSkeys", 7); - - The keyring can be manipulated after it has been given to the socket. This - permits the server to add more keys, replace keys, etc. while it is live. - - (3) A local address must then be bound: - - struct sockaddr_rxrpc srx = { - .srx_family = AF_RXRPC, - .srx_service = VL_SERVICE_ID, /* RxRPC service ID */ - .transport_type = SOCK_DGRAM, /* type of transport socket */ - .transport.sin_family = AF_INET, - .transport.sin_port = htons(7000), /* AFS callback */ - .transport.sin_address = 0, /* all local interfaces */ - }; - bind(server, &srx, sizeof(srx)); - - More than one service ID may be bound to a socket, provided the transport - parameters are the same. The limit is currently two. To do this, bind() - should be called twice. - - (4) If service upgrading is required, first two service IDs must have been - bound and then the following option must be set: - - unsigned short service_ids[2] = { from_ID, to_ID }; - setsockopt(server, SOL_RXRPC, RXRPC_UPGRADEABLE_SERVICE, - service_ids, sizeof(service_ids)); - - This will automatically upgrade connections on service from_ID to service - to_ID if they request it. This will be reflected in msg_name obtained - through recvmsg() when the request data is delivered to userspace. 
- - (5) The server is then set to listen out for incoming calls: - - listen(server, 100); - - (6) The kernel notifies the server of pending incoming connections by sending - it a message for each. This is received with recvmsg() on the server - socket. It has no data, and has a single dataless control message - attached: - - RXRPC_NEW_CALL - - The address that can be passed back by recvmsg() at this point should be - ignored since the call for which the message was posted may have gone by - the time it is accepted - in which case the first call still on the queue - will be accepted. - - (7) The server then accepts the new call by issuing a sendmsg() with two - pieces of control data and no actual data: - - RXRPC_ACCEPT - indicate connection acceptance - RXRPC_USER_CALL_ID - specify user ID for this call - - (8) The first request data packet will then be posted to the server socket for - recvmsg() to pick up. At that point, the RxRPC address for the call can - be read from the address fields in the msghdr struct. - - Subsequent request data will be posted to the server socket for recvmsg() - to collect as it arrives. All but the last piece of the request data will - be delivered with MSG_MORE flagged. - - All data will be delivered with the following control message attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - (9) The reply data should then be posted to the server socket using a series - of sendmsg() calls, each with the following control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - - MSG_MORE should be set in msghdr::msg_flags on all but the last message - for a particular call. - -(10) The final ACK from the client will be posted for retrieval by recvmsg() - when it is received. It will take the form of a dataless message with two - control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - RXRPC_ACK - indicates final ACK (no data) - - MSG_EOR will be flagged to indicate that this is the final message for - this call. - -(11) Up to the point the final packet of reply data is sent, the call can be - aborted by calling sendmsg() with a dataless message with the following - control messages attached: - - RXRPC_USER_CALL_ID - specifies the user ID for this call - RXRPC_ABORT - indicates abort code (4 byte data) - - Any packets waiting in the socket's receive queue will be discarded if - this is issued. - -Note that all the communications for a particular service take place through -the one server socket, using control messages on sendmsg() and recvmsg() to -determine the call affected. - - -========================= -AF_RXRPC KERNEL INTERFACE -========================= - -The AF_RXRPC module also provides an interface for use by in-kernel utilities -such as the AFS filesystem. This permits such a utility to: - - (1) Use different keys directly on individual client calls on one socket - rather than having to open a whole slew of sockets, one for each key it - might want to use. - - (2) Avoid having RxRPC call request_key() at the point of issue of a call or - opening of a socket. Instead the utility is responsible for requesting a - key at the appropriate point. AFS, for instance, would do this during VFS - operations such as open() or unlink(). The key is then handed through - when the call is initiated. - - (3) Request the use of something other than GFP_KERNEL to allocate memory. - - (4) Avoid the overhead of using the recvmsg() call. 
RxRPC messages can be - intercepted before they get put into the socket Rx queue and the socket - buffers manipulated directly. - -To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, -bind an address as appropriate and listen if it's to be a server socket, but -then it passes this to the kernel interface functions. - -The kernel interface functions are as follows: - - (*) Begin a new client call. - - struct rxrpc_call * - rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, - struct key *key, - unsigned long user_call_ID, - s64 tx_total_len, - gfp_t gfp, - rxrpc_notify_rx_t notify_rx, - bool upgrade, - bool intr, - unsigned int debug_id); - - This allocates the infrastructure to make a new RxRPC call and assigns - call and connection numbers. The call will be made on the UDP port that - the socket is bound to. The call will go to the destination address of a - connected client socket unless an alternative is supplied (srx is - non-NULL). - - If a key is supplied then this will be used to secure the call instead of - the key bound to the socket with the RXRPC_SECURITY_KEY sockopt. Calls - secured in this way will still share connections if at all possible. - - The user_call_ID is equivalent to that supplied to sendmsg() in the - control data buffer. It is entirely feasible to use this to point to a - kernel data structure. - - tx_total_len is the amount of data the caller is intending to transmit - with this call (or -1 if unknown at this point). Setting the data size - allows the kernel to encrypt directly to the packet buffers, thereby - saving a copy. The value may not be less than -1. - - notify_rx is a pointer to a function to be called when events such as - incoming data packets or remote aborts happen. - - upgrade should be set to true if a client operation should request that - the server upgrade the service to a better one. The resultant service ID - is returned by rxrpc_kernel_recv_data(). - - intr should be set to true if the call should be interruptible. If this - is not set, this function may not return until a channel has been - allocated; if it is set, the function may return -ERESTARTSYS. - - debug_id is the call debugging ID to be used for tracing. This can be - obtained by atomically incrementing rxrpc_debug_id. - - If this function is successful, an opaque reference to the RxRPC call is - returned. The caller now holds a reference on this and it must be - properly ended. - - (*) End a client call. - - void rxrpc_kernel_end_call(struct socket *sock, - struct rxrpc_call *call); - - This is used to end a previously begun call. The user_call_ID is expunged - from AF_RXRPC's knowledge and will not be seen again in association with - the specified call. - - (*) Send data through a call. - - typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, - unsigned long user_call_ID, - struct sk_buff *skb); - - int rxrpc_kernel_send_data(struct socket *sock, - struct rxrpc_call *call, - struct msghdr *msg, - size_t len, - rxrpc_notify_end_tx_t notify_end_rx); - - This is used to supply either the request part of a client call or the - reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the - data buffers to be used. msg_iov may not be NULL and must point - exclusively to in-kernel virtual addresses. msg.msg_flags may be given - MSG_MORE if there will be subsequent data sends for this call. - - The msg must not specify a destination address, control data or any flags - other than MSG_MORE. 
len is the total amount of data to transmit. - - notify_end_rx can be NULL or it can be used to specify a function to be - called when the call changes state to end the Tx phase. This function is - called with the call-state spinlock held to prevent any reply or final ACK - from being delivered first. - - (*) Receive data from a call. - - int rxrpc_kernel_recv_data(struct socket *sock, - struct rxrpc_call *call, - void *buf, - size_t size, - size_t *_offset, - bool want_more, - u32 *_abort, - u16 *_service) - - This is used to receive data from either the reply part of a client call - or the request part of a service call. buf and size specify how much - data is desired and where to store it. *_offset is added on to buf and - subtracted from size internally; the amount copied into the buffer is - added to *_offset before returning. - - want_more should be true if further data will be required after this is - satisfied and false if this is the last item of the receive phase. - - There are three normal returns: 0 if the buffer was filled and want_more - was true; 1 if the buffer was filled, the last DATA packet has been - emptied and want_more was false; and -EAGAIN if the function needs to be - called again. - - If the last DATA packet is processed but the buffer contains less than - the amount requested, EBADMSG is returned. If want_more wasn't set, but - more data was available, EMSGSIZE is returned. - - If a remote ABORT is detected, the abort code received will be stored in - *_abort and ECONNABORTED will be returned. - - The service ID that the call ended up with is returned into *_service. - This can be used to see if a call got a service upgrade. - - (*) Abort a call. - - void rxrpc_kernel_abort_call(struct socket *sock, - struct rxrpc_call *call, - u32 abort_code); - - This is used to abort a call if it's still in an abortable state. The - abort code specified will be placed in the ABORT message sent. - - (*) Intercept received RxRPC messages. - - typedef void (*rxrpc_interceptor_t)(struct sock *sk, - unsigned long user_call_ID, - struct sk_buff *skb); - - void - rxrpc_kernel_intercept_rx_messages(struct socket *sock, - rxrpc_interceptor_t interceptor); - - This installs an interceptor function on the specified AF_RXRPC socket. - All messages that would otherwise wind up in the socket's Rx queue are - then diverted to this function. Note that care must be taken to process - the messages in the right order to maintain DATA message sequentiality. - - The interceptor function itself is provided with the address of the socket - and handling the incoming message, the ID assigned by the kernel utility - to the call and the socket buffer containing the message. - - The skb->mark field indicates the type of message: - - MARK MEANING - =============================== ======================================= - RXRPC_SKB_MARK_DATA Data message - RXRPC_SKB_MARK_FINAL_ACK Final ACK received for an incoming call - RXRPC_SKB_MARK_BUSY Client call rejected as server busy - RXRPC_SKB_MARK_REMOTE_ABORT Call aborted by peer - RXRPC_SKB_MARK_NET_ERROR Network error detected - RXRPC_SKB_MARK_LOCAL_ERROR Local error encountered - RXRPC_SKB_MARK_NEW_CALL New incoming call awaiting acceptance - - The remote abort message can be probed with rxrpc_kernel_get_abort_code(). - The two error messages can be probed with rxrpc_kernel_get_error_number(). - A new call can be accepted with rxrpc_kernel_accept_call(). 
- - Data messages can have their contents extracted with the usual bunch of - socket buffer manipulation functions. A data message can be determined to - be the last one in a sequence with rxrpc_kernel_is_data_last(). When a - data message has been used up, rxrpc_kernel_data_consumed() should be - called on it. - - Messages should be handed to rxrpc_kernel_free_skb() to dispose of. It - is possible to get extra refs on all types of message for later freeing, - but this may pin the state of a call until the message is finally freed. - - (*) Accept an incoming call. - - struct rxrpc_call * - rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID); - - This is used to accept an incoming call and to assign it a call ID. This - function is similar to rxrpc_kernel_begin_call() and calls accepted must - be ended in the same way. - - If this function is successful, an opaque reference to the RxRPC call is - returned. The caller now holds a reference on this and it must be - properly ended. - - (*) Reject an incoming call. - - int rxrpc_kernel_reject_call(struct socket *sock); - - This is used to reject the first incoming call on the socket's queue with - a BUSY message. -ENODATA is returned if there were no incoming calls. - Other errors may be returned if the call had been aborted (-ECONNABORTED) - or had timed out (-ETIME). - - (*) Allocate a null key for doing anonymous security. - - struct key *rxrpc_get_null_key(const char *keyname); - - This is used to allocate a null RxRPC key that can be used to indicate - anonymous security for a particular domain. - - (*) Get the peer address of a call. - - void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, - struct sockaddr_rxrpc *_srx); - - This is used to find the remote peer address of a call. - - (*) Set the total transmit data size on a call. - - void rxrpc_kernel_set_tx_length(struct socket *sock, - struct rxrpc_call *call, - s64 tx_total_len); - - This sets the amount of data that the caller is intending to transmit on a - call. It's intended to be used for setting the reply size as the request - size should be set when the call is begun. tx_total_len may not be less - than zero. - - (*) Get call RTT. - - u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call); - - Get the RTT to the peer in use by a call. The value returned is in - nanoseconds. - - (*) Check call still alive. - - bool rxrpc_kernel_check_life(struct socket *sock, - struct rxrpc_call *call, - u32 *_life); - void rxrpc_kernel_probe_life(struct socket *sock, - struct rxrpc_call *call); - - The first function passes back in *_life a number that is updated when - ACKs are received from the peer (notably including PING RESPONSE ACKs - which we can elicit by sending PING ACKs to see if the call still exists - on the server). The caller should compare the numbers of two calls to - this function to see if the call is still alive after waiting for a - suitable interval. It also returns true as long as the call hasn't yet - reached the completed state. - - This allows the caller to work out if the server is still contactable and - if the call is still alive on the server while waiting for the server to - process a client operation. - - The second function causes a ping ACK to be transmitted to try to provoke - the peer into responding, which would then cause the value returned by the - first function to change. Note that this must be called in TASK_RUNNING - state.
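For illustration, a caller waiting for a server operation might poll like this; a rough sketch in which my_operation_pending() is a hypothetical predicate:

	u32 life, last_life;
	bool alive;

	alive = rxrpc_kernel_check_life(sock, call, &last_life);
	while (alive && my_operation_pending(call)) {
		schedule_timeout_interruptible(HZ);
		alive = rxrpc_kernel_check_life(sock, call, &life);
		if (alive && life == last_life)
			/* No new ACKs seen; prod the peer for a response. */
			rxrpc_kernel_probe_life(sock, call);
		last_life = life;
	}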
- - (*) Get reply timestamp. - - bool rxrpc_kernel_get_reply_time(struct socket *sock, - struct rxrpc_call *call, - ktime_t *_ts) - - This allows the timestamp on the first DATA packet of the reply of a - client call to be queried, provided that it is still in the Rx ring. If - successful, the timestamp will be stored into *_ts and true will be - returned; false will be returned otherwise. - - (*) Get remote client epoch. - - u32 rxrpc_kernel_get_epoch(struct socket *sock, - struct rxrpc_call *call) - - This allows the epoch that's contained in packets of an incoming client - call to be queried. This value is returned. The function is always - successful if the call is still in progress. It shouldn't be called once - the call has expired. Note that calling this on a local client call only - returns the local epoch. - - This value can be used to determine if the remote client has been - restarted as it shouldn't change otherwise. - - (*) Set the maximum lifespan on a call. - - void rxrpc_kernel_set_max_life(struct socket *sock, - struct rxrpc_call *call, - unsigned long hard_timeout) - - This sets the maximum lifespan on a call to hard_timeout (which is in - jiffies). In the event of the timeout occurring, the call will be - aborted and -ETIME or -ETIMEDOUT will be returned. - - -======================= -CONFIGURABLE PARAMETERS -======================= - -The RxRPC protocol driver has a number of configurable parameters that can be -adjusted through sysctls in /proc/sys/net/rxrpc/: - - (*) req_ack_delay - - The amount of time in milliseconds after receiving a packet with the - request-ack flag set before we honour the flag and actually send the - requested ack. - - Usually the other side won't stop sending packets until the advertised - reception window is full (to a maximum of 255 packets), so delaying the - ACK permits several packets to be ACK'd in one go. - - (*) soft_ack_delay - - The amount of time in milliseconds after receiving a new packet before we - generate a soft-ACK to tell the sender that it doesn't need to resend. - - (*) idle_ack_delay - - The amount of time in milliseconds after all the packets currently in the - received queue have been consumed before we generate a hard-ACK to tell - the sender it can free its buffers, assuming no other reason occurs that - would cause us to send an ACK. - - (*) resend_timeout - - The amount of time in milliseconds after transmitting a packet before we - transmit it again, assuming no ACK is received from the receiver telling - us they got it. - - (*) max_call_lifetime - - The maximum amount of time in seconds that a call may be in progress - before we preemptively kill it. - - (*) dead_call_expiry - - The amount of time in seconds before we remove a dead call from the call - list. Dead calls are kept around for a little while for the purpose of - repeating ACK and ABORT packets. - - (*) connection_expiry - - The amount of time in seconds after a connection was last used before we - remove it from the connection list. While a connection is in existence, - it serves as a placeholder for negotiated security; when it is deleted, - the security must be renegotiated. - - (*) transport_expiry - - The amount of time in seconds after a transport was last used before we - remove it from the transport list. While a transport is in existence, it - serves to anchor the peer data and keeps the connection ID counter. - - (*) rxrpc_rx_window_size - - The size of the receive window in packets. This is the maximum number of - unconsumed received packets we're willing to hold in memory for any - particular call. - - (*) rxrpc_rx_mtu - - The maximum packet MTU size that we're willing to receive in bytes. This - indicates to the peer whether we're willing to accept jumbo packets. - - (*) rxrpc_rx_jumbo_max - - The maximum number of packets that we're willing to accept in a jumbo - packet. Non-terminal packets in a jumbo packet must contain a four byte - header plus exactly 1412 bytes of data. The terminal packet must contain - a four byte header plus any amount of data. In any event, a jumbo packet - may not exceed rxrpc_rx_mtu in size.
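For example, assuming these parameters are exposed as correspondingly named files under /proc/sys/net/rxrpc/, the idle-ACK delay could be inspected and adjusted from a shell like this (a sketch; the value shown is illustrative only):

	# cat /proc/sys/net/rxrpc/idle_ack_delay
	# echo 2 >/proc/sys/net/rxrpc/idle_ack_delay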
diff --git a/MAINTAINERS b/MAINTAINERS index b28823ab48c5..866a0dcd66ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14593,7 +14593,7 @@ M: David Howells L: linux-afs@lists.infradead.org S: Supported W: https://www.infradead.org/~dhowells/kafs/ -F: Documentation/networking/rxrpc.txt +F: Documentation/networking/rxrpc.rst F: include/keys/rxrpc-type.h F: include/net/af_rxrpc.h F: include/trace/events/rxrpc.h diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 57ebb29c26ad..d706bb408365 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -18,7 +18,7 @@ config AF_RXRPC This module at the moment only supports client operations and is currently incomplete. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config AF_RXRPC_IPV6 bool "IPv6 support for RxRPC" @@ -41,7 +41,7 @@ config AF_RXRPC_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config RXKAD @@ -56,4 +56,4 @@ config RXKAD Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC through the use of the key retention service. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 2bbb38161851..174e903e18de 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -21,7 +21,7 @@ static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; /* * RxRPC operating parameters. * - * See Documentation/networking/rxrpc.txt and the variable definitions for more + * See Documentation/networking/rxrpc.rst and the variable definitions for more * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { -- cgit v1.2.3 From 4ac0b122ee63d89b5aaf2e3e376092d8ac02a567 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:32 +0200 Subject: docs: networking: convert tproxy.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - adjust indentation, whitespace and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/tproxy.rst | 109 ++++++++++++++++++++++++++++++++++++ Documentation/networking/tproxy.txt | 104 ---------------------------------- net/netfilter/Kconfig | 2 +- 4 files changed, 111 insertions(+), 105 deletions(-) create mode 100644 Documentation/networking/tproxy.rst delete mode 100644 Documentation/networking/tproxy.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 8f9a84b8e3f2..b423b2db5f96 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -110,6 +110,7 @@ Contents: tcp-thin team timestamping + tproxy ..
only:: subproject and html diff --git a/Documentation/networking/tproxy.rst b/Documentation/networking/tproxy.rst new file mode 100644 index 000000000000..00dc3a1a66b4 --- /dev/null +++ b/Documentation/networking/tproxy.rst @@ -0,0 +1,109 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +Transparent proxy support +========================= + +This feature adds Linux 2.2-like transparent proxy support to current kernels. +To use it, enable the socket match and the TPROXY target in your kernel config. +You will need policy routing too, so be sure to enable that as well. + +From Linux 4.18 transparent proxy support is also available in nf_tables. + +1. Making non-local sockets work +================================ + +The idea is that you identify packets with destination address matching a local +socket on your box, set the packet mark to a certain value:: + + # iptables -t mangle -N DIVERT + # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT + # iptables -t mangle -A DIVERT -j MARK --set-mark 1 + # iptables -t mangle -A DIVERT -j ACCEPT + +Alternatively you can do this in nft with the following commands:: + + # nft add table filter + # nft add chain filter divert "{ type filter hook prerouting priority -150; }" + # nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept + +And then match on that value using policy routing to have those packets +delivered locally:: + + # ip rule add fwmark 1 lookup 100 + # ip route add local 0.0.0.0/0 dev lo table 100 + +Because of certain restrictions in the IPv4 routing output code you'll have to +modify your application to allow it to send datagrams _from_ non-local IP +addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket +option before calling bind:: + + fd = socket(AF_INET, SOCK_STREAM, 0); + /* - 8< -*/ + int value = 1; + setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); + /* - 8< -*/ + name.sin_family = AF_INET; + name.sin_port = htons(0xCAFE); + name.sin_addr.s_addr = htonl(0xDEADBEEF); + bind(fd, &name, sizeof(name)); + +A trivial patch for netcat is available here: +http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch + + +2. Redirecting traffic +====================== + +Transparent proxying often involves "intercepting" traffic on a router. This is +usually done with the iptables REDIRECT target; however, there are serious +limitations of that method. One of the major issues is that it actually +modifies the packets to change the destination address -- which might not be +acceptable in certain situations. (Think of proxying UDP for example: you won't +be able to find out the original destination address. Even in case of TCP +getting the original destination address is racy.) + +The 'TPROXY' target provides similar functionality without relying on NAT. Simply +add rules like this to the iptables ruleset above:: + + # iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ + --tproxy-mark 0x1/0x1 --on-port 50080 + +Or the following rule to nft:: + + # nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept + +Note that for this to work you'll have to modify the proxy to enable (SOL_IP, +IP_TRANSPARENT) for the listening socket. + +As an example implementation, tcprdr is available here: +https://git.breakpoint.cc/cgit/fw/tcprdr.git/ +This tool is written by Florian Westphal and it was used for testing during the +nf_tables implementation. +
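+As a sketch of the listening side (illustrative only: error handling is
+omitted and the port matches the TPROXY rule above), note that for TCP the
+original destination address is simply the accepted socket's local name, as
+reported by getsockname()::
+
+    int value = 1;
+
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value));
+    name.sin_family = AF_INET;
+    name.sin_port = htons(50080);
+    name.sin_addr.s_addr = htonl(INADDR_ANY);
+    bind(fd, &name, sizeof(name));
+    listen(fd, 128);
+
+    client = accept(fd, NULL, NULL);
+    len = sizeof(dst);
+    getsockname(client, (struct sockaddr *)&dst, &len);
+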
+3. Iptables and nf_tables extensions +==================================== + +To use tproxy you'll need to have the following modules compiled for iptables: + + - NETFILTER_XT_MATCH_SOCKET + - NETFILTER_XT_TARGET_TPROXY + +Or the following modules for nf_tables: + + - NFT_SOCKET + - NFT_TPROXY + +4. Application support +====================== + +4.1. Squid +---------- + +Squid 3.HEAD has support built-in. To use it, pass +'--enable-linux-netfilter' to configure and set the 'tproxy' option on +the HTTP listener you redirect traffic to with the TPROXY iptables +target. + +For more information please consult the following page on the Squid +wiki: http://wiki.squid-cache.org/Features/Tproxy4 diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt deleted file mode 100644 index b9a188823d9f..000000000000 --- a/Documentation/networking/tproxy.txt +++ /dev/null @@ -1,104 +0,0 @@ -Transparent proxy support -========================= - -This feature adds Linux 2.2-like transparent proxy support to current kernels. -To use it, enable the socket match and the TPROXY target in your kernel config. -You will need policy routing too, so be sure to enable that as well. - -From Linux 4.18 transparent proxy support is also available in nf_tables. - -1. Making non-local sockets work -================================ - -The idea is that you identify packets with destination address matching a local -socket on your box, set the packet mark to a certain value: - -# iptables -t mangle -N DIVERT -# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT -# iptables -t mangle -A DIVERT -j MARK --set-mark 1 -# iptables -t mangle -A DIVERT -j ACCEPT - -Alternatively you can do this in nft with the following commands: - -# nft add table filter -# nft add chain filter divert "{ type filter hook prerouting priority -150; }" -# nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept - -And then match on that value using policy routing to have those packets -delivered locally: - -# ip rule add fwmark 1 lookup 100 -# ip route add local 0.0.0.0/0 dev lo table 100 - -Because of certain restrictions in the IPv4 routing output code you'll have to -modify your application to allow it to send datagrams _from_ non-local IP -addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket -option before calling bind: - -fd = socket(AF_INET, SOCK_STREAM, 0); -/* - 8< -*/ -int value = 1; -setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); -/* - 8< -*/ -name.sin_family = AF_INET; -name.sin_port = htons(0xCAFE); -name.sin_addr.s_addr = htonl(0xDEADBEEF); -bind(fd, &name, sizeof(name)); - -A trivial patch for netcat is available here: -http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch - - -2. Redirecting traffic -====================== - -Transparent proxying often involves "intercepting" traffic on a router. This is -usually done with the iptables REDIRECT target; however, there are serious -limitations of that method. One of the major issues is that it actually -modifies the packets to change the destination address -- which might not be -acceptable in certain situations. (Think of proxying UDP for example: you won't -be able to find out the original destination address. Even in case of TCP -getting the original destination address is racy.) - -The 'TPROXY' target provides similar functionality without relying on NAT.
Simply -add rules like this to the iptables ruleset above: - -# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ - --tproxy-mark 0x1/0x1 --on-port 50080 - -Or the following rule to nft: - -# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept - -Note that for this to work you'll have to modify the proxy to enable (SOL_IP, -IP_TRANSPARENT) for the listening socket. - -As an example implementation, tcprdr is available here: -https://git.breakpoint.cc/cgit/fw/tcprdr.git/ -This tool is written by Florian Westphal and it was used for testing during the -nf_tables implementation. - -3. Iptables and nf_tables extensions -==================================== - -To use tproxy you'll need to have the following modules compiled for iptables: - - NETFILTER_XT_MATCH_SOCKET - - NETFILTER_XT_TARGET_TPROXY - -Or the floowing modules for nf_tables: - - NFT_SOCKET - - NFT_TPROXY - -4. Application support -====================== - -4.1. Squid ----------- - -Squid 3.HEAD has support built-in. To use it, pass -'--enable-linux-netfilter' to configure and set the 'tproxy' option on -the HTTP listener you redirect traffic to with the TPROXY iptables -target. - -For more information please consult the following page on the Squid -wiki: http://wiki.squid-cache.org/Features/Tproxy4 diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 468fea1aebba..3a3915d2e1ea 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1043,7 +1043,7 @@ config NETFILTER_XT_TARGET_TPROXY on Netfilter connection tracking and NAT, unlike REDIRECT. For it to work you will have to configure certain iptables rules and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. + see Documentation/networking/tproxy.rst. To compile it as a module, choose M here. If unsure, say N. -- cgit v1.2.3 From 2b195850128f5bafde177b12489d9fa27962cc1e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:41 -0700 Subject: tcp: add tp->dup_ack_counter In commit 86de5921a3d5 ("tcp: defer SACK compression after DupThresh") I added a TCP_FASTRETRANS_THRESH bias to tp->compressed_ack in order to enable sack compression only after 3 dupacks. Since we plan to relax this rule for flows that involve stacks not requiring this old rule, this patch adds a distinct tp->dup_ack_counter. This means the TCP_FASTRETRANS_THRESH value is now used in a single location that a future patch can adjust: if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { tp->dup_ack_counter++; goto send_now; } This patch also introduces tcp_sack_compress_send_ack() helper to ease following patch comprehension. This patch refines LINUX_MIB_TCPACKCOMPRESSED to not count the acks that we had to send if the timer expires or tcp_sack_compress_send_ack() is sending an ack. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 8 +++++++- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 421c99c12291..2c6f87e9f0cf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; + u8 dup_ack_counter; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf4ced9273e8..da777df0a0ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4327,6 +4327,27 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } +static void tcp_sack_compress_send_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->compressed_ack) + return; + + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + + /* Since we have to send one ack finally, + * subtract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - 1); + + tp->compressed_ack = 0; + tcp_send_ack(sk); +} + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4355,8 +4376,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - tcp_send_ack(sk); + tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -5275,15 +5295,13 @@ send_now: if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = 0; + tp->dup_ack_counter = 0; } - - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { + tp->dup_ack_counter++; goto send_now; - + } + tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ba4482130f08..c414aeb1efa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { + if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = TCP_FASTRETRANS_THRESH; + tp->compressed_ack); + tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c3f26dcd6704..ada046f425d2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + if (tp->compressed_ack) { + /* Since we have to send one ack finally, + * subtract one from tp->compressed_ack
to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. */ + tp->compressed_ack--; tcp_send_ack(sk); + } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) -- cgit v1.2.3 From ccd0628fca440268711560a1dbacc727b4f9e214 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:42 -0700 Subject: tcp: tcp_sack_new_ofo_skb() should be more conservative Currently, tcp_sack_new_ofo_skb() sends an ack if prior acks were 'compressed', if room has to be made in tp->selective_acks[]. But there is no guarantee all four sack ranges can be included in the SACK option. As a matter of fact, when TCP timestamps option is used, only three SACK ranges can be included. Let's assume only two ranges can be included, and force the ack: - When we touch more than 2 ranges in the reordering done when tcp_sack_extend() succeeds. - If we have at least 2 ranges when adding a new one. This enforces that before a range is in third or fourth position, at least one ACK packet included it in first/second position. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da777df0a0ba..ef921ecba415 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4348,6 +4348,12 @@ static void tcp_sack_compress_send_ack(struct sock *sk) tcp_send_ack(sk); } +/* Reasonable amount of sack blocks included in TCP SACK option * The max is 4, but this becomes 3 if TCP timestamps are there. * Given that SACK packets might be lost, be conservative and use 2. */ +#define TCP_SACK_BLOCKS_EXPECTED 2 + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4360,6 +4366,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) swap(*sp, *(sp - 1)); @@ -4369,6 +4377,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) } } + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); + /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - tcp_sack_compress_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; -- cgit v1.2.3 From a70437cc09a11771870e9f6bfc0ba1237161daa8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2020 10:35:43 -0700 Subject: tcp: add hrtimer slack to sack compression Add a sysctl to control hrtimer slack, with a default of 100 usec. This gives the opportunity to reduce system overhead, and help very short RTT flows. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S.
Miller --- Documentation/networking/ip-sysctl.rst | 8 ++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_ipv4.c | 1 + 5 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3266aee9e052..50b440d29a13 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -651,6 +651,14 @@ tcp_comp_sack_delay_ns - LONG INTEGER Default : 1,000,000 ns (1 ms) +tcp_comp_sack_slack_ns - LONG INTEGER + This sysctl controls the slack used when arming the + timer used by SACK compression. This gives extra time + for small RTT flows, and reduces system overhead by allowing + opportunistic reduction of timer interrupts. + + Default : 100,000 ns (100 us) + tcp_comp_sack_nr - INTEGER Max number of SACK that can be compressed. Using 0 disables SACK compression. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5acdb4d414c4..9e36738c1fe1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -173,6 +173,7 @@ struct netns_ipv4 { int sysctl_tcp_rmem[3]; int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; + unsigned long sysctl_tcp_comp_sack_slack_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 95ad71e76cc3..3a628423d27b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1329,6 +1329,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "tcp_comp_sack_slack_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ef921ecba415..d68128a672ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5324,8 +5324,9 @@ send_now: delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_SOFT); + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), + sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83a5d24e13b8..6c05f1ceb538 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2780,6 +2780,7 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.2.3 From 34a9c361dd480041d790fff3d6ea58513c8769e8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 30 Apr 2020 17:37:02 +0000 Subject: hsr: remove hsr interface if all slaves are removed When all hsr slave interfaces are removed, the hsr interface doesn't work. At that moment, it's fine to remove the unused hsr interface automatically to save resources.
That's a common behavior of virtual interfaces. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_main.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 26d6c39f24e1..e2564de67603 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -15,12 +15,23 @@ #include "hsr_framereg.h" #include "hsr_slave.h" +static bool hsr_slave_empty(struct hsr_priv *hsr) +{ + struct hsr_port *port; + + hsr_for_each_port(hsr, port) + if (port->type != HSR_PT_MASTER) + return false; + return true; +} + static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { - struct net_device *dev; struct hsr_port *port, *master; + struct net_device *dev; struct hsr_priv *hsr; + LIST_HEAD(list_kill); int mtu_max; int res; @@ -85,8 +96,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master->dev->mtu = mtu_max; break; case NETDEV_UNREGISTER: - if (!is_hsr_master(dev)) + if (!is_hsr_master(dev)) { + master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); + if (hsr_slave_empty(master->hsr)) { + unregister_netdevice_queue(master->dev, + &list_kill); + unregister_netdevice_many(&list_kill); + } + } break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change -- cgit v1.2.3 From 47a1494b8208461094923400c396ce4b8163c064 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:05 +0200 Subject: netlink: remove type-unsafe validation_data pointer In the netlink policy, we currently have a void *validation_data that's pointing to different things: * a u32 value for bitfield32, * the netlink policy for nested/nested array * the string for NLA_REJECT Remove the pointer and place appropriate type-safe items in the union instead. While at it, completely dissolve the pointer for the bitfield32 case and just put the value there directly. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 60 +++++++++++++++++++++++++++------------------------ lib/nlattr.c | 20 ++++++++--------- net/sched/act_api.c | 13 +++-------- net/sched/sch_red.c | 9 ++++---- 4 files changed, 49 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 67c57d6942e3..671b29d170a8 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -217,7 +217,7 @@ enum nla_policy_validation { * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if - * validation_data is also used, for the max attr + * nested_policy is also used, for the max attr * number in the nested policy. * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, @@ -235,27 +235,25 @@ enum nla_policy_validation { * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * - * Meaning of `validation_data' field: + * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and - * validation data must point to a u32 value of valid - * flags - * NLA_REJECT This attribute is always rejected and validation data + * `bitfield32_valid' is the u32 value of valid flags + * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. 
- * NLA_NESTED Points to a nested policy to validate, must also set - * `len' to the max attribute number. + * NLA_NESTED `nested_policy' to a nested policy to validate, must + * also set `len' to the max attribute number. Use the + * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. - * NLA_NESTED_ARRAY Points to a nested policy to validate, must also set - * `len' to the max attribute number. The difference to - * NLA_NESTED is the structure - NLA_NESTED has the - * nested attributes directly inside, while an array has - * the nested attributes at another level down and the - * attributes directly in the nesting don't matter. - * All other Unused - but note that it's a union - * - * Meaning of `min' and `max' fields, use via NLA_POLICY_MIN, NLA_POLICY_MAX - * and NLA_POLICY_RANGE: + * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, + * must also set `len' to the max attribute number. Use + * the provided NLA_POLICY_NESTED_ARRAY() macro. + * The difference to NLA_NESTED is the structure: + * NLA_NESTED has the nested attributes directly inside + * while an array has the nested attributes at another + * level down and the attribute types directly in the + * nesting don't matter. * NLA_U8, * NLA_U16, * NLA_U32, @@ -263,29 +261,31 @@ enum nla_policy_validation { * NLA_S8, * NLA_S16, * NLA_S32, - * NLA_S64 These are used depending on the validation_type - * field, if that is min/max/range then the minimum, - * maximum and both are used (respectively) to check + * NLA_S64 The `min' and `max' fields are used depending on the + * validation_type field, if that is min/max/range then + * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that as usual in the code instead. + * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and + * NLA_POLICY_RANGE() macros. * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: - * NLA_BINARY Validation function called for the attribute, - * not compatible with use of the validation_data - * as in NLA_BITFIELD32, NLA_REJECT, NLA_NESTED and - * NLA_NESTED_ARRAY. + * NLA_BINARY Validation function called for the attribute. 
* All other Unused - but note that it's a union * * Example: + * + * static const u32 myvalidflags = 0xff231023; + * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, - * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, + * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { @@ -293,7 +293,9 @@ struct nla_policy { u8 validation_type; u16 len; union { - const void *validation_data; + const u32 bitfield32_valid; + const char *reject_message; + const struct nla_policy *nested_policy; struct { s16 min, max; }; @@ -329,13 +331,15 @@ struct nla_policy { #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ - { .type = NLA_NESTED, .validation_data = policy, .len = maxattr } + { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ - { .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr } + { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) +#define NLA_POLICY_BITFIELD32(valid) \ + { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_INT_TYPE(tp) \ diff --git a/lib/nlattr.c b/lib/nlattr.c index cace9b307781..3df05db732ca 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -45,7 +45,7 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { }; static int validate_nla_bitfield32(const struct nlattr *nla, - const u32 *valid_flags_mask) + const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); @@ -53,11 +53,11 @@ static int validate_nla_bitfield32(const struct nlattr *nla, return -EINVAL; /*disallow invalid bit selector */ - if (bf->selector & ~*valid_flags_mask) + if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ - if (bf->value & ~*valid_flags_mask) + if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ @@ -206,9 +206,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; case NLA_REJECT: - if (extack && pt->validation_data) { + if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); - extack->_msg = pt->validation_data; + extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; @@ -223,7 +223,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; - err = validate_nla_bitfield32(nla, pt->validation_data); + err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; @@ -268,9 +268,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; if (attrlen < NLA_HDRLEN) goto out_err; - if (pt->validation_data) { + if (pt->nested_policy) { err = __nla_validate(nla_data(nla), nla_len(nla), pt->len, - pt->validation_data, validate, + pt->nested_policy, validate, extack); if (err < 0) { /* @@ -289,11 +289,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype, break; if (attrlen < NLA_HDRLEN) goto out_err; - if (pt->validation_data) { + if (pt->nested_policy) { int err; err = 
nla_validate_array(nla_data(nla), nla_len(nla), - pt->len, pt->validation_data, + pt->len, pt->nested_policy, extack, validate); if (err < 0) { /* diff --git a/net/sched/act_api.c b/net/sched/act_api.c index df4560909157..fbbec2e562f5 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -876,19 +876,14 @@ static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) return hw_stats_bf.value; } -static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; -static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; - static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, - [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_flags_allowed }, - [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_hw_stats_allowed }, + [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS), + [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -1454,10 +1449,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return ret; } -static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index c7de47c942e3..555a1b9e467f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -48,7 +48,7 @@ struct red_sched_data { struct Qdisc *qdisc; }; -static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP; +#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) static inline int red_use_ecn(struct red_sched_data *q) { @@ -212,8 +212,7 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, - [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &red_supported_flags }, + [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), }; static int red_change(struct Qdisc *sch, struct nlattr *opt, @@ -248,7 +247,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, - tb[TCA_RED_FLAGS], red_supported_flags, + tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS, &flags_bf, &userbits, extack); if (err) return err; @@ -372,7 +371,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, red_supported_flags)) + q->flags, TC_RED_SUPPORTED_FLAGS)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3 From d15da2a2e813679aeac8bff3be38d3adc849c1a6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:07 +0200 Subject: nl80211: link recursive netlink nested policy Now that we have limited recursive policy validation to avoid stack overflows, change nl80211 to actually link the nested policy (linking back to itself eventually), which allows some 
code cleanups. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/wireless/nl80211.c | 10 ++++------ net/wireless/nl80211.h | 2 -- net/wireless/pmsr.c | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..57c618b6cb0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -253,6 +253,8 @@ static int validate_ie_attr(const struct nlattr *attr, } /* policy for the attributes */ +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; + static const struct nla_policy nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, }, @@ -296,11 +298,7 @@ nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { static const struct nla_policy nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, - /* - * we could specify this again to be the top-level policy, - * but that would open us up to recursion problems ... - */ - [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED }, + [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy), [NL80211_PMSR_PEER_ATTR_REQ] = NLA_POLICY_NESTED(nl80211_pmsr_req_attr_policy), [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT }, @@ -347,7 +345,7 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; -const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a41e94a49a89..d3e8e426c486 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,8 +11,6 @@ int nl80211_init(void); void nl80211_exit(void); -extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; - void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd); bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 63dc8023447f..a95c79d18349 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -187,10 +187,9 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); - /* need to validate here, we don't want to have validation recursion */ err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_CHAN], - nl80211_policy, info->extack); + NULL, info->extack); if (err) return err; -- cgit v1.2.3 From c7721c05a6217491810f406ec28df80a9bcf3546 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:10 +0200 Subject: netlink: remove NLA_EXACT_LEN_WARN Use a validation type instead, so we can later expose the NLA_* values to userspace for policy descriptions. Some transformations were done with this spatch: @@ identifier p; expression X, L, A; @@ struct nla_policy p[X] = { [A] = -{ .type = NLA_EXACT_LEN_WARN, .len = L }, +NLA_POLICY_EXACT_LEN_WARN(L), ... }; Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- include/net/netlink.h | 15 +++++----- lib/nlattr.c | 16 ++++++---- net/wireless/nl80211.c | 81 +++++++++++--------------------------------------- 3 files changed, 36 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4acd7165e900..4d4a733f1e8d 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -182,7 +182,6 @@ enum { NLA_BITFIELD32, NLA_REJECT, NLA_EXACT_LEN, - NLA_EXACT_LEN_WARN, NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -204,6 +203,7 @@ enum nla_policy_validation { NLA_VALIDATE_MAX, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, + NLA_VALIDATE_WARN_TOO_LONG, }; /** @@ -237,10 +237,10 @@ enum nla_policy_validation { * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused - * NLA_EXACT_LEN Attribute must have exactly this length, otherwise - * it is rejected. - * NLA_EXACT_LEN_WARN Attribute should have exactly this length, a warning - * is logged if it is longer, shorter is rejected. + * NLA_EXACT_LEN Attribute should have exactly this length, otherwise + * it is rejected or warned about, the latter happening + * if and only if the `validation_type' is set to + * NLA_VALIDATE_WARN_TOO_LONG. * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * @@ -350,8 +350,9 @@ struct nla_policy { }; #define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } -#define NLA_POLICY_EXACT_LEN_WARN(_len) { .type = NLA_EXACT_LEN_WARN, \ - .len = _len } +#define NLA_POLICY_EXACT_LEN_WARN(_len) \ + { .type = NLA_EXACT_LEN, .len = _len, \ + .validation_type = NLA_VALIDATE_WARN_TOO_LONG, } #define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) diff --git a/lib/nlattr.c b/lib/nlattr.c index 21ef3998b9d9..6dcbe1bedd3b 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -261,7 +261,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype, BUG_ON(pt->type > NLA_TYPE_MAX); if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) || - (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) { + (pt->type == NLA_EXACT_LEN && + pt->validation_type == NLA_VALIDATE_WARN_TOO_LONG && + attrlen != pt->len)) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { @@ -287,11 +289,6 @@ static int validate_nla(const struct nlattr *nla, int maxtype, } switch (pt->type) { - case NLA_EXACT_LEN: - if (attrlen != pt->len) - goto out_err; - break; - case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); @@ -405,6 +402,13 @@ static int validate_nla(const struct nlattr *nla, int maxtype, goto out_err; break; + case NLA_EXACT_LEN: + if (pt->validation_type != NLA_VALIDATE_WARN_TOO_LONG) { + if (attrlen != pt->len) + goto out_err; + break; + } + /* fall through */ default: if (pt->len) minlen = pt->len; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 57c618b6cb0e..519414468b5d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -376,11 +376,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, - [NL80211_ATTR_MAC] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, - [NL80211_ATTR_PREV_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), + 
[NL80211_ATTR_PREV_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, @@ -432,10 +429,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED }, [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG }, - [NL80211_ATTR_HT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_HT_CAPABILITY_LEN - }, + [NL80211_ATTR_HT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_HT_CAPABILITY_LEN), [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, @@ -466,10 +460,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, - [NL80211_ATTR_PMKID] = { - .type = NLA_EXACT_LEN_WARN, - .len = WLAN_PMKID_LEN - }, + [NL80211_ATTR_PMKID] = NLA_POLICY_EXACT_LEN_WARN(WLAN_PMKID_LEN), [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, @@ -533,10 +524,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, - [NL80211_ATTR_VHT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_VHT_CAPABILITY_LEN - }, + [NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN), [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127), [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1), @@ -574,10 +562,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY, .len = IEEE80211_QOS_MAP_LEN_MAX }, - [NL80211_ATTR_MAC_HINT] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, @@ -589,10 +574,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, - [NL80211_ATTR_MAC_MASK] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_MASK] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, @@ -604,21 +586,15 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, - [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, - [NL80211_ATTR_FILS_NONCES] = { - .type = NLA_EXACT_LEN_WARN, - .len = 2 * FILS_NONCE_LEN - }, + 
[NL80211_ATTR_FILS_NONCES] = NLA_POLICY_EXACT_LEN_WARN(2 * FILS_NONCE_LEN), [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, - [NL80211_ATTR_BSSID] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, + [NL80211_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) @@ -631,7 +607,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, .len = FILS_ERP_MAX_RRK_LEN }, - [NL80211_ATTR_FILS_CACHE_ID] = { .type = NLA_EXACT_LEN_WARN, .len = 2 }, + [NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2), [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, @@ -701,10 +677,7 @@ static const struct nla_policy nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 }, [NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 }, - [NL80211_WOWLAN_TCP_DST_MAC] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 }, @@ -734,18 +707,9 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KEK_LEN, - }, - [NL80211_REKEY_DATA_KCK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KCK_LEN, - }, - [NL80211_REKEY_DATA_REPLAY_CTR] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_REPLAY_CTR_LEN - }, + [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), + [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), }; static const struct nla_policy @@ -760,10 +724,7 @@ static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, - [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), @@ -795,10 +756,7 @@ nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 }, [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 }, - [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 }, [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY, @@ -4404,10 +4362,7 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { .len = NL80211_MAX_SUPP_RATES }, 
[NL80211_TXRATE_HT] = { .type = NLA_BINARY, .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = { - .type = NLA_EXACT_LEN_WARN, - .len = sizeof(struct nl80211_txrate_vht), - }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), [NL80211_TXRATE_GI] = { .type = NLA_U8 }, }; -- cgit v1.2.3 From d07dcf9aadd6b2842b439e8668ff7ea2873f28d7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:12 +0200 Subject: netlink: add infrastructure to expose policies to userspace Add, and use in generic netlink, helpers to dump out a netlink policy to userspace, including all the range validation data, nested policies etc. This lets userspace discover what the kernel understands. For families/commands other than generic netlink, the helpers need to be used directly in an appropriate command, or we can add some infrastructure (a new netlink family) that those can register their policies with for introspection. I'm not that familiar with non-generic netlink, so that's left out for now. The data exposed to userspace also includes min and max length for binary/string data. I've done that instead of letting the userspace tools figure out whether min/max is intended based on the type, so that we can extend this later in the kernel; we might want to just use the range data, for example. Because of this, I opted not to expose the NLA_* values directly, even if some of them are already exposed via BPF: with min/max length we don't need to have different types here for NLA_BINARY/NLA_MIN_LEN/NLA_EXACT_LEN, we just make them all NL_ATTR_TYPE_BINARY with min/max length optionally set. Similarly, we don't really need NLA_MSECS, and perhaps can remove it in the future - but not if we encode it into the userspace API now. It gets mapped to NL_ATTR_TYPE_U64 here. Note that what is exposed here corresponds to the strict policy interpretation, and NLA_UNSPEC items are omitted entirely. To get those exposed, change them to NLA_MIN_LEN, which behaves in exactly the same way but is exposed. Signed-off-by: Johannes Berg Signed-off-by: David S.
Miller --- include/net/netlink.h | 6 + include/uapi/linux/genetlink.h | 2 + include/uapi/linux/netlink.h | 103 ++++++++++++++ net/netlink/Makefile | 2 +- net/netlink/genetlink.c | 78 +++++++++++ net/netlink/policy.c | 308 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 net/netlink/policy.c (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 557b67f1db99..c0411f14fb53 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1933,4 +1933,10 @@ void nla_get_range_unsigned(const struct nla_policy *pt, void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *state); +bool netlink_policy_dump_loop(unsigned long *state); +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); + #endif diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 877f7fa95466..9c0636ec2286 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -48,6 +48,7 @@ enum { CTRL_CMD_NEWMCAST_GRP, CTRL_CMD_DELMCAST_GRP, CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, __CTRL_CMD_MAX, }; @@ -62,6 +63,7 @@ enum { CTRL_ATTR_MAXATTR, CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, __CTRL_ATTR_MAX, }; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0a4d73317759..eac8a6a648ea 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -249,4 +249,107 @@ struct nla_bitfield32 { __u32 selector; }; +/* + * policy descriptions - it's specific to each family how this is used + * Normally, it should be retrieved via a dump inside another attribute + * specifying where it applies. + */ + +/** + * enum netlink_attribute_type - type of an attribute + * @NL_ATTR_TYPE_INVALID: unused + * @NL_ATTR_TYPE_FLAG: flag attribute (present/not present) + * @NL_ATTR_TYPE_U8: 8-bit unsigned attribute + * @NL_ATTR_TYPE_U16: 16-bit unsigned attribute + * @NL_ATTR_TYPE_U32: 32-bit unsigned attribute + * @NL_ATTR_TYPE_U64: 64-bit unsigned attribute + * @NL_ATTR_TYPE_S8: 8-bit signed attribute + * @NL_ATTR_TYPE_S16: 16-bit signed attribute + * @NL_ATTR_TYPE_S32: 32-bit signed attribute + * @NL_ATTR_TYPE_S64: 64-bit signed attribute + * @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified + * @NL_ATTR_TYPE_STRING: string, min/max length may be specified + * @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string, + * min/max length may be specified + * @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute + * consists of sub-attributes. The nested policy and maxtype + * inside may be specified. + * @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this + * attribute contains sub-attributes whose type is irrelevant + * (just used to separate the array entries) and each such array + * entry has attributes again, the policy for those inner ones + * and the corresponding maxtype may be specified. 
+ * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + */ +enum netlink_attribute_type { + NL_ATTR_TYPE_INVALID, + + NL_ATTR_TYPE_FLAG, + + NL_ATTR_TYPE_U8, + NL_ATTR_TYPE_U16, + NL_ATTR_TYPE_U32, + NL_ATTR_TYPE_U64, + + NL_ATTR_TYPE_S8, + NL_ATTR_TYPE_S16, + NL_ATTR_TYPE_S32, + NL_ATTR_TYPE_S64, + + NL_ATTR_TYPE_BINARY, + NL_ATTR_TYPE_STRING, + NL_ATTR_TYPE_NUL_STRING, + + NL_ATTR_TYPE_NESTED, + NL_ATTR_TYPE_NESTED_ARRAY, + + NL_ATTR_TYPE_BITFIELD32, +}; + +/** + * enum netlink_policy_type_attr - policy type attributes + * @NL_POLICY_TYPE_ATTR_UNSPEC: unused + * @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute, + * &enum netlink_attribute_type (U32) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary + * attributes, no minimum if not given (U32) + * @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary + * attributes, no maximum if not given (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and + * nested array types (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy + * attribute for nested and nested array types, this can + * in theory be < the size of the policy pointed to by + * the index, if limited inside the nesting (U32) + * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the + * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + */ +enum netlink_policy_type_attr { + NL_POLICY_TYPE_ATTR_UNSPEC, + NL_POLICY_TYPE_ATTR_TYPE, + NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + NL_POLICY_TYPE_ATTR_MIN_LENGTH, + NL_POLICY_TYPE_ATTR_MAX_LENGTH, + NL_POLICY_TYPE_ATTR_POLICY_IDX, + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + NL_POLICY_TYPE_ATTR_PAD, + + /* keep last */ + __NL_POLICY_TYPE_ATTR_MAX, + NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1 +}; + #endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/net/netlink/Makefile b/net/netlink/Makefile index de42df7f0068..e05202708c90 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,7 +3,7 @@ # Makefile for the netlink driver. 
# -obj-y := af_netlink.o genetlink.o +obj-y := af_netlink.o genetlink.o policy.o obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o netlink_diag-y := diag.o diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9f357aa22b94..2f049692e012 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1043,6 +1043,80 @@ static int genl_ctrl_event(int event, const struct genl_family *family, return 0; } +static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct genl_family *rt; + unsigned int fam_id = cb->args[0]; + int err; + + if (!fam_id) { + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + + err = genlmsg_parse(cb->nlh, &genl_ctrl, tb, + genl_ctrl.maxattr, + genl_ctrl.policy, cb->extack); + if (err) + return err; + + if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) + return -EINVAL; + + if (tb[CTRL_ATTR_FAMILY_ID]) { + fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); + } else { + rt = genl_family_find_byname( + nla_data(tb[CTRL_ATTR_FAMILY_NAME])); + if (!rt) + return -ENOENT; + fam_id = rt->id; + } + } + + rt = genl_family_find_byid(fam_id); + if (!rt) + return -ENOENT; + + if (!rt->policy) + return -ENODATA; + + err = netlink_policy_dump_start(rt->policy, rt->maxattr, &cb->args[1]); + if (err) + return err; + + while (netlink_policy_dump_loop(&cb->args[1])) { + void *hdr; + struct nlattr *nest; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &genl_ctrl, + NLM_F_MULTI, CTRL_CMD_GETPOLICY); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, rt->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, CTRL_ATTR_POLICY); + if (!nest) + goto nla_put_failure; + + if (netlink_policy_dump_write(skb, cb->args[1])) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + genlmsg_end(skb, hdr); + continue; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + break; + } + + cb->args[0] = fam_id; + return skb->len; +} + static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, @@ -1050,6 +1124,10 @@ static const struct genl_ops genl_ctrl_ops[] = { .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, }, + { + .cmd = CTRL_CMD_GETPOLICY, + .dumpit = ctrl_dumppolicy, + }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { diff --git a/net/netlink/policy.c b/net/netlink/policy.c new file mode 100644 index 000000000000..f6491853c797 --- /dev/null +++ b/net/netlink/policy.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NETLINK Policy advertisement to userspace + * + * Authors: Johannes Berg + * + * Copyright 2019 Intel Corporation + */ + +#include +#include +#include +#include + +#define INITIAL_POLICIES_ALLOC 10 + +struct nl_policy_dump { + unsigned int policy_idx; + unsigned int attr_idx; + unsigned int n_alloc; + struct { + const struct nla_policy *policy; + unsigned int maxtype; + } policies[]; +}; + +static int add_policy(struct nl_policy_dump **statep, + const struct nla_policy *policy, + unsigned int maxtype) +{ + struct nl_policy_dump *state = *statep; + unsigned int n_alloc, i; + + if (!policy || !maxtype) + return 0; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return 0; + + if (!state->policies[i].policy) { + state->policies[i].policy = policy; + state->policies[i].maxtype = maxtype; + return 0; + } + } + + n_alloc = state->n_alloc + INITIAL_POLICIES_ALLOC; + state = krealloc(state, struct_size(state, policies, n_alloc), + GFP_KERNEL); + if (!state) + return -ENOMEM; + + 
state->policies[state->n_alloc].policy = policy; + state->policies[state->n_alloc].maxtype = maxtype; + state->n_alloc = n_alloc; + *statep = state; + + return 0; +} + +static unsigned int get_policy_idx(struct nl_policy_dump *state, + const struct nla_policy *policy) +{ + unsigned int i; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return i; + } + + WARN_ON_ONCE(1); + return -1; +} + +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *_state) +{ + struct nl_policy_dump *state; + unsigned int policy_idx; + int err; + + /* also returns 0 if "*_state" is our ERR_PTR() end marker */ + if (*_state) + return 0; + + /* + * walk the policies and nested ones first, and build + * a linear list of them. + */ + + state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), + GFP_KERNEL); + if (!state) + return -ENOMEM; + state->n_alloc = INITIAL_POLICIES_ALLOC; + + err = add_policy(&state, policy, maxtype); + if (err) + return err; + + for (policy_idx = 0; + policy_idx < state->n_alloc && state->policies[policy_idx].policy; + policy_idx++) { + const struct nla_policy *policy; + unsigned int type; + + policy = state->policies[policy_idx].policy; + + for (type = 0; + type <= state->policies[policy_idx].maxtype; + type++) { + switch (policy[type].type) { + case NLA_NESTED: + case NLA_NESTED_ARRAY: + err = add_policy(&state, + policy[type].nested_policy, + policy[type].len); + if (err) + return err; + break; + default: + break; + } + } + } + + *_state = (unsigned long)state; + + return 0; +} + +static bool netlink_policy_dump_finished(struct nl_policy_dump *state) +{ + return state->policy_idx >= state->n_alloc || + !state->policies[state->policy_idx].policy; +} + +bool netlink_policy_dump_loop(unsigned long *_state) +{ + struct nl_policy_dump *state = (void *)*_state; + + if (IS_ERR(state)) + return false; + + if (netlink_policy_dump_finished(state)) { + kfree(state); + /* store end marker instead of freed state */ + *_state = (unsigned long)ERR_PTR(-ENOENT); + return false; + } + + return true; +} + +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state) +{ + struct nl_policy_dump *state = (void *)_state; + const struct nla_policy *pt; + struct nlattr *policy, *attr; + enum netlink_attribute_type type; + bool again; + +send_attribute: + again = false; + + pt = &state->policies[state->policy_idx].policy[state->attr_idx]; + + policy = nla_nest_start(skb, state->policy_idx); + if (!policy) + return -ENOBUFS; + + attr = nla_nest_start(skb, state->attr_idx); + if (!attr) + goto nla_put_failure; + + switch (pt->type) { + default: + case NLA_UNSPEC: + case NLA_REJECT: + /* skip - use NLA_MIN_LEN to advertise such */ + nla_nest_cancel(skb, policy); + again = true; + goto next; + case NLA_NESTED: + type = NL_ATTR_TYPE_NESTED; + /* fall through */ + case NLA_NESTED_ARRAY: + if (pt->type == NLA_NESTED_ARRAY) + type = NL_ATTR_TYPE_NESTED_ARRAY; + if (pt->nested_policy && pt->len && + (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX, + get_policy_idx(state, pt->nested_policy)) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + pt->len))) + goto nla_put_failure; + break; + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: { + struct netlink_range_validation range; + + if (pt->type == NLA_U8) + type = NL_ATTR_TYPE_U8; + else if (pt->type == NLA_U16) + type = NL_ATTR_TYPE_U16; + else if (pt->type == NLA_U32) + type = NL_ATTR_TYPE_U32; + else + type = 
NL_ATTR_TYPE_U64; + + nla_get_range_unsigned(pt, &range); + + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: { + struct netlink_range_validation_signed range; + + if (pt->type == NLA_S8) + type = NL_ATTR_TYPE_S8; + else if (pt->type == NLA_S16) + type = NL_ATTR_TYPE_S16; + else if (pt->type == NLA_S32) + type = NL_ATTR_TYPE_S32; + else + type = NL_ATTR_TYPE_S64; + + nla_get_range_signed(pt, &range); + + if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_BITFIELD32: + type = NL_ATTR_TYPE_BITFIELD32; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + pt->bitfield32_valid)) + goto nla_put_failure; + break; + case NLA_EXACT_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + if (pt->type == NLA_STRING) + type = NL_ATTR_TYPE_STRING; + else if (pt->type == NLA_NUL_STRING) + type = NL_ATTR_TYPE_NUL_STRING; + else + type = NL_ATTR_TYPE_BINARY; + if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + pt->len)) + goto nla_put_failure; + break; + case NLA_MIN_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_FLAG: + type = NL_ATTR_TYPE_FLAG; + break; + } + + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type)) + goto nla_put_failure; + + /* finish and move state to next attribute */ + nla_nest_end(skb, attr); + nla_nest_end(skb, policy); + +next: + state->attr_idx += 1; + if (state->attr_idx > state->policies[state->policy_idx].maxtype) { + state->attr_idx = 0; + state->policy_idx++; + } + + if (again) { + if (netlink_policy_dump_finished(state)) + return -ENODATA; + goto send_attribute; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, policy); + return -ENOBUFS; +} -- cgit v1.2.3 From cff9f12b18915d957a2130885a00f8ab15cff7e4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:31 +0300 Subject: net/core: Introduce netdev_get_xmit_slave Add a new ndo to get the xmit slave of a master device. The reference counters are not incremented, so the caller must be careful with locks. The user can ask for the xmit slave on the assumption that all the slaves can transmit, by setting the all_slaves arg to true. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 12 ++++++++++++ net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..26bc0f11b7ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1146,6 +1146,12 @@ struct netdev_net_notifier { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev.
* + * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, + * struct sk_buff *skb, + * bool all_slaves); + * Get the xmit slave of master device. If all_slaves is true, function + * assume all the slaves can transmit. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1389,6 +1395,9 @@ struct net_device_ops { struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2731,6 +2740,9 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..e6c10980abfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7785,6 +7785,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From 883780af72090daf9ab53779a3085a6ddfc468ca Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:27 +0200 Subject: docs: networking: convert x25-iface.txt to ReST Not much to be done here: - add SPDX header; - adjust title markup; - remove a tail whitespace; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25-iface.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/x25-iface.txt | 123 ------------------------------- include/uapi/linux/if_x25.h | 2 +- net/x25/Kconfig | 2 +- 5 files changed, 132 insertions(+), 125 deletions(-) create mode 100644 Documentation/networking/x25-iface.rst delete mode 100644 Documentation/networking/x25-iface.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index a72fdfb391b6..7a4bdbc111b0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -115,6 +115,7 @@ Contents: udplite vrf vxlan + x25-iface .. 
only:: subproject and html diff --git a/Documentation/networking/x25-iface.rst b/Documentation/networking/x25-iface.rst new file mode 100644 index 000000000000..df401891dce6 --- /dev/null +++ b/Documentation/networking/x25-iface.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================- +X.25 Device Driver Interface +============================- + +Version 1.1 + + Jonathan Naylor 26.12.96 + +This is a description of the messages to be passed between the X.25 Packet +Layer and the X.25 device driver. They are designed to allow for the easy +setting of the LAPB mode from within the Packet Layer. + +The X.25 device driver will be coded normally as per the Linux device driver +standards. Most X.25 device drivers will be moderately similar to the +already existing Ethernet device drivers. However unlike those drivers, the +X.25 device driver has a state associated with it, and this information +needs to be passed to and from the Packet Layer for proper operation. + +All messages are held in sk_buff's just like real data to be transmitted +over the LAPB link. The first byte of the skbuff indicates the meaning of +the rest of the skbuff, if any more information does exist. + + +Packet Layer to Device Driver +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data to be transmitted +over the LAPB link. The LAPB link should already exist before any data is +passed down. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +Establish the LAPB link. If the link is already established then the connect +confirmation message should be returned as soon as possible. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +Terminate the LAPB link. If it is already disconnected then the disconnect +confirmation message should be returned as soon as possible. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + +Device Driver to Packet Layer +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data that has been +received over the LAPB link. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +LAPB link has been established. The same message is used for both a LAPB +link connect_confirmation and a connect_indication. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +LAPB link has been terminated. This same message is used for both a LAPB +link disconnect_confirmation and a disconnect_indication. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + + +Possible Problems +================= + +(Henner Eisen, 2000-10-28) + +The X.25 packet layer protocol depends on a reliable datalink service. +The LAPB protocol provides such reliable service. But this reliability +is not preserved by the Linux network device driver interface: + +- With Linux 2.4.x (and above) SMP kernels, packet ordering is not + preserved. Even if a device driver calls netif_rx(skb1) and later + netif_rx(skb2), skb2 might be delivered to the network layer + earlier that skb1. +- Data passed upstream by means of netif_rx() might be dropped by the + kernel if the backlog queue is congested. + +The X.25 packet layer protocol will detect this and reset the virtual +call in question. But many upper layer protocols are not designed to +handle such N-Reset events gracefully. And frequent N-Reset events +will always degrade performance. 
+ +Thus, driver authors should make netif_rx() as reliable as possible: + +SMP re-ordering will not occur if the driver's interrupt handler is +always executed on the same CPU. Thus, + +- Driver authors should use irq affinity for the interrupt handler. + +The probability of packet loss due to backlog congestion can be +reduced by the following measures or a combination thereof: + +(1) Drivers for kernel versions 2.4.x and above should always check the + return value of netif_rx(). If it returns NET_RX_DROP, the + driver's LAPB protocol must not confirm reception of the frame + to the peer. + This will reliably suppress packet loss. The LAPB protocol will + automatically cause the peer to re-transmit the dropped packet + later. + The lapb module interface was modified to support this. Its + data_indication() method should now transparently pass the + netif_rx() return value to the (lapb module) caller. +(2) Drivers for kernel versions 2.2.x should always check the global + variable netdev_dropping when a new frame is received. The driver + should only call netif_rx() if netdev_dropping is zero. Otherwise + the driver should not confirm delivery of the frame and drop it. + Alternatively, the driver can queue the frame internally and call + netif_rx() later when netif_dropping is 0 again. In that case, delivery + confirmation should also be deferred such that the internal queue + cannot grow to much. + This will not reliably avoid packet loss, but the probability + of packet loss in netif_rx() path will be significantly reduced. +(3) Additionally, driver authors might consider to support + CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up + when a previously congested backlog queue becomes empty again. + The driver could uses this for flow-controlling the peer by means + of the LAPB protocol's flow-control service. diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt deleted file mode 100644 index 7f213b556e85..000000000000 --- a/Documentation/networking/x25-iface.txt +++ /dev/null @@ -1,123 +0,0 @@ - X.25 Device Driver Interface 1.1 - - Jonathan Naylor 26.12.96 - -This is a description of the messages to be passed between the X.25 Packet -Layer and the X.25 device driver. They are designed to allow for the easy -setting of the LAPB mode from within the Packet Layer. - -The X.25 device driver will be coded normally as per the Linux device driver -standards. Most X.25 device drivers will be moderately similar to the -already existing Ethernet device drivers. However unlike those drivers, the -X.25 device driver has a state associated with it, and this information -needs to be passed to and from the Packet Layer for proper operation. - -All messages are held in sk_buff's just like real data to be transmitted -over the LAPB link. The first byte of the skbuff indicates the meaning of -the rest of the skbuff, if any more information does exist. - - -Packet Layer to Device Driver ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data to be transmitted -over the LAPB link. The LAPB link should already exist before any data is -passed down. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -Establish the LAPB link. If the link is already established then the connect -confirmation message should be returned as soon as possible. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -Terminate the LAPB link. 
If it is already disconnected then the disconnect -confirmation message should be returned as soon as possible. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - -Device Driver to Packet Layer ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data that has been -received over the LAPB link. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -LAPB link has been established. The same message is used for both a LAPB -link connect_confirmation and a connect_indication. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -LAPB link has been terminated. This same message is used for both a LAPB -link disconnect_confirmation and a disconnect_indication. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - - -Possible Problems -================= - -(Henner Eisen, 2000-10-28) - -The X.25 packet layer protocol depends on a reliable datalink service. -The LAPB protocol provides such reliable service. But this reliability -is not preserved by the Linux network device driver interface: - -- With Linux 2.4.x (and above) SMP kernels, packet ordering is not - preserved. Even if a device driver calls netif_rx(skb1) and later - netif_rx(skb2), skb2 might be delivered to the network layer - earlier that skb1. -- Data passed upstream by means of netif_rx() might be dropped by the - kernel if the backlog queue is congested. - -The X.25 packet layer protocol will detect this and reset the virtual -call in question. But many upper layer protocols are not designed to -handle such N-Reset events gracefully. And frequent N-Reset events -will always degrade performance. - -Thus, driver authors should make netif_rx() as reliable as possible: - -SMP re-ordering will not occur if the driver's interrupt handler is -always executed on the same CPU. Thus, - -- Driver authors should use irq affinity for the interrupt handler. - -The probability of packet loss due to backlog congestion can be -reduced by the following measures or a combination thereof: - -(1) Drivers for kernel versions 2.4.x and above should always check the - return value of netif_rx(). If it returns NET_RX_DROP, the - driver's LAPB protocol must not confirm reception of the frame - to the peer. - This will reliably suppress packet loss. The LAPB protocol will - automatically cause the peer to re-transmit the dropped packet - later. - The lapb module interface was modified to support this. Its - data_indication() method should now transparently pass the - netif_rx() return value to the (lapb module) caller. -(2) Drivers for kernel versions 2.2.x should always check the global - variable netdev_dropping when a new frame is received. The driver - should only call netif_rx() if netdev_dropping is zero. Otherwise - the driver should not confirm delivery of the frame and drop it. - Alternatively, the driver can queue the frame internally and call - netif_rx() later when netif_dropping is 0 again. In that case, delivery - confirmation should also be deferred such that the internal queue - cannot grow to much. - This will not reliably avoid packet loss, but the probability - of packet loss in netif_rx() path will be significantly reduced. -(3) Additionally, driver authors might consider to support - CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up - when a previously congested backlog queue becomes empty again. - The driver could uses this for flow-controlling the peer by means - of the LAPB protocol's flow-control service. 
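To make the first-byte convention above concrete, here is a minimal sketch of how a driver might signal LAPB link establishment upstream. It is illustrative only and not part of this patch: x25_notify_connect() is a hypothetical helper, while X25_IFACE_CONNECT and ETH_P_X25 are the real constants (see the if_x25.h hunk that follows). It also checks the netif_rx() return value, as the document recommends for 2.4+ drivers:

#include <linux/if_ether.h>
#include <linux/if_x25.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical helper: tell the packet layer the LAPB link is up. */
static int x25_notify_connect(struct net_device *dev)
{
	struct sk_buff *skb = dev_alloc_skb(1);

	if (!skb)
		return -ENOMEM;

	/* The first byte of the skb carries the message meaning. */
	skb_put_u8(skb, X25_IFACE_CONNECT);
	skb->protocol = htons(ETH_P_X25);
	skb->dev = dev;
	skb_reset_mac_header(skb);

	/* NET_RX_DROP here means the confirmation must not be sent. */
	return netif_rx(skb);
}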
diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 5d962448345f..3a5938e38370 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -18,7 +18,7 @@ #include -/* Documentation/networking/x25-iface.txt */ +/* Documentation/networking/x25-iface.rst */ #define X25_IFACE_DATA 0x00 #define X25_IFACE_CONNECT 0x01 #define X25_IFACE_DISCONNECT 0x02 diff --git a/net/x25/Kconfig b/net/x25/Kconfig index 2ecb2e5e241e..a328f79885d1 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -21,7 +21,7 @@ config X25 . Information about X.25 for Linux is contained in the files and - . + . One connects to an X.25 network either with a dedicated network card using the X.21 protocol (not yet supported by Linux) or one can do -- cgit v1.2.3 From c4ea03fdfd122b4ff293bff643c2369852e9cc1c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:28 +0200 Subject: docs: networking: convert x25.txt to ReST Not much to be done here: - add SPDX header; - add a document title; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25.rst | 48 ++++++++++++++++++++++++++++++++++++++ Documentation/networking/x25.txt | 44 ---------------------------------- net/x25/Kconfig | 2 +- 4 files changed, 50 insertions(+), 45 deletions(-) create mode 100644 Documentation/networking/x25.rst delete mode 100644 Documentation/networking/x25.txt (limited to 'net') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7a4bdbc111b0..75521e6c473b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -116,6 +116,7 @@ Contents: vrf vxlan x25-iface + x25 .. only:: subproject and html diff --git a/Documentation/networking/x25.rst b/Documentation/networking/x25.rst new file mode 100644 index 000000000000..00e45d384ba0 --- /dev/null +++ b/Documentation/networking/x25.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Linux X.25 Project +================== + +As my third year dissertation at University I have taken it upon myself to +write an X.25 implementation for Linux. My aim is to provide a complete X.25 +Packet Layer and a LAPB module to allow for "normal" X.25 to be run using +Linux. There are two sorts of X.25 cards available, intelligent ones that +implement LAPB on the card itself, and unintelligent ones that simply do +framing, bit-stuffing and checksumming. These both need to be handled by the +system. + +I therefore decided to write the implementation such that as far as the +Packet Layer is concerned, the link layer was being performed by a lower +layer of the Linux kernel and therefore it did not concern itself with +implementation of LAPB. Therefore the LAPB modules would be called by +unintelligent X.25 card drivers and not by intelligent ones, this would +provide a uniform device driver interface, and simplify configuration. + +To confuse matters a little, an 802.2 LLC implementation for Linux is being +written which will allow X.25 to be run over an Ethernet (or Token Ring) and +conform with the JNT "Pink Book", this will have a different interface to +the Packet Layer but there will be no confusion since the class of device +being served by the LLC will be completely separate from LAPB. The LLC +implementation is being done as part of another protocol project (SNA) and +by a different author. 
+ +Just when you thought that it could not become more confusing, another +option appeared, XOT. This allows X.25 Packet Layer frames to operate over +the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the +format and behaviour of the protocol. If time permits this option will also +be actively considered. + +A linux-x25 mailing list has been created at vger.kernel.org to support the +development and use of Linux X.25. It is early days yet, but interested +parties are welcome to subscribe to it. Just send a message to +majordomo@vger.kernel.org with the following in the message body: + +subscribe linux-x25 +end + +The contents of the Subject line are ignored. + +Jonathan + +g4klx@g4klx.demon.co.uk diff --git a/Documentation/networking/x25.txt b/Documentation/networking/x25.txt deleted file mode 100644 index c91c6d7159ff..000000000000 --- a/Documentation/networking/x25.txt +++ /dev/null @@ -1,44 +0,0 @@ -Linux X.25 Project - -As my third year dissertation at University I have taken it upon myself to -write an X.25 implementation for Linux. My aim is to provide a complete X.25 -Packet Layer and a LAPB module to allow for "normal" X.25 to be run using -Linux. There are two sorts of X.25 cards available, intelligent ones that -implement LAPB on the card itself, and unintelligent ones that simply do -framing, bit-stuffing and checksumming. These both need to be handled by the -system. - -I therefore decided to write the implementation such that as far as the -Packet Layer is concerned, the link layer was being performed by a lower -layer of the Linux kernel and therefore it did not concern itself with -implementation of LAPB. Therefore the LAPB modules would be called by -unintelligent X.25 card drivers and not by intelligent ones, this would -provide a uniform device driver interface, and simplify configuration. - -To confuse matters a little, an 802.2 LLC implementation for Linux is being -written which will allow X.25 to be run over an Ethernet (or Token Ring) and -conform with the JNT "Pink Book", this will have a different interface to -the Packet Layer but there will be no confusion since the class of device -being served by the LLC will be completely separate from LAPB. The LLC -implementation is being done as part of another protocol project (SNA) and -by a different author. - -Just when you thought that it could not become more confusing, another -option appeared, XOT. This allows X.25 Packet Layer frames to operate over -the Internet using TCP/IP as a reliable link layer. RFC1613 specifies the -format and behaviour of the protocol. If time permits this option will also -be actively considered. - -A linux-x25 mailing list has been created at vger.kernel.org to support the -development and use of Linux X.25. It is early days yet, but interested -parties are welcome to subscribe to it. Just send a message to -majordomo@vger.kernel.org with the following in the message body: - -subscribe linux-x25 -end - -The contents of the Subject line are ignored. - -Jonathan - -g4klx@g4klx.demon.co.uk diff --git a/net/x25/Kconfig b/net/x25/Kconfig index a328f79885d1..9f0d58b0b90b 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -20,7 +20,7 @@ config X25 You can read more about X.25 at and . Information about X.25 for Linux is contained in the files - and + and . 
One connects to an X.25 network either with a dedicated network card -- cgit v1.2.3 From beecf11bc2188067824591612151c4dc6ec383c7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Apr 2020 16:31:52 -0700 Subject: bpf: Bpf_{g,s}etsockopt for struct bpf_sock_addr Currently, bpf_getsockopt and bpf_setsockopt helpers operate on the 'struct bpf_sock_ops' context in BPF_PROG_TYPE_SOCK_OPS program. Let's generalize them and make them available for 'struct bpf_sock_addr'. That way, in the future, we can allow those helpers in more places. As an example, let's expose those 'struct bpf_sock_addr' based helpers to BPF_CGROUP_INET{4,6}_CONNECT hooks. That way we can override CC before the connection is made. v3: * Expose custom helpers for bpf_sock_addr context instead of doing generic bpf_sock argument (as suggested by Daniel). Even with try_socket_lock that doesn't sleep we have a problem where context sk is already locked and socket lock is non-nestable. v2: * s/BPF_PROG_TYPE_CGROUP_SOCKOPT/BPF_PROG_TYPE_SOCK_OPS/ Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200430233152.199403-1-sdf@google.com --- include/uapi/linux/bpf.h | 14 ++- net/core/filter.c | 118 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 14 ++- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/progs/connect4_prog.c | 46 +++++++++ 5 files changed, 166 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. 
* It supports the following *level*\ s: * diff --git a/net/core/filter.c b/net/core/filter.c index 70b32723e6be..dfaf5df13722 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4194,16 +4194,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +#define SOCKOPT_CC_REINIT (1 << 0) + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { if (optlen != sizeof(int)) return -EINVAL; @@ -4298,7 +4301,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4345,24 +4348,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4428,8 +4421,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = 
bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -6043,6 +6099,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6261,9 +6333,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/tools/testing/selftests/bpf/config index 60e3ae5d4e48..6e5b94c036ca 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -37,3 +37,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_TCP_CONG_DCTCP=y diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index ad3c498a8150..972918cd2d7f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,10 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -33,6 +38,43 @@ int do_bind(struct bpf_sock_addr *ctx) return 1; } +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char dctcp[TCP_CA_NAME_MAX] = "dctcp"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &dctcp, sizeof(dctcp))) + return 1; + if (verify_cc(ctx, dctcp)) + return 1; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -66,6 +108,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. */ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. */ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); -- cgit v1.2.3 From f0628c524fd188c3f9418e12478dfdfadacba815 Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Fri, 24 Apr 2020 16:06:16 +0800 Subject: net: Replace the limit of TCP_LINGER2 with TCP_FIN_TIMEOUT_MAX This patch changes the behavior of TCP_LINGER2 with respect to its limit. sysctl_tcp_fin_timeout used to be the limit of TCP_LINGER2, but now it is only the default value. A new macro named TCP_FIN_TIMEOUT_MAX, set to 2 minutes, is added as the limit of TCP_LINGER2. Since TCP_LINGER2 used sysctl_tcp_fin_timeout as both the default value and the limit in the past, the system administrator could not set the default value for most sockets while letting some sockets have a greater timeout. It was arguably a mistake to let the sysctl be the limit of TCP_LINGER2. We could add a new sysctl to set the max of TCP_LINGER2, but the FIN-WAIT-2 timeout usually does not need to be very long, and 2 minutes is legal considering the TCP specs. Changes in v3: - Remove the new socket option and change the TCP_LINGER2 behavior so that the timeout can be set to a value between sysctl_tcp_fin_timeout and 2 minutes. Changes in v2: - Add int overflow check for the new socket option. Changes in v1: - Add a new socket option to set timeout greater than sysctl_tcp_fin_timeout.
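To make the new limit concrete, a minimal userspace sketch (not part of the patch; the helper name is made up for the example). With this change, a per-socket value above the sysctl default but no greater than two minutes is now honored, and anything larger is clamped to 120 seconds instead of being rejected:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Set the FIN-WAIT-2 timeout of one TCP socket, in seconds. */
static int set_fin_wait2_timeout(int fd, int seconds)
{
	/* Values above 120s are clamped to 120s after this patch;
	 * a negative value still selects the immediate-close
	 * behavior, unchanged from before.
	 */
	return setsockopt(fd, IPPROTO_TCP, TCP_LINGER2,
			  &seconds, sizeof(seconds));
}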
Signed-off-by: Cambda Zhu Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index dcf9a72eeaa6..1beed50522b1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -126,6 +126,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ +#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6d87de434377..8c1250103959 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3035,8 +3035,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) - tp->linger2 = 0; + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + tp->linger2 = TCP_FIN_TIMEOUT_MAX; else tp->linger2 = val * HZ; break; -- cgit v1.2.3 From 41a46913bee76d8493681442907ccd989ced2633 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 27 Apr 2020 18:37:43 +0200 Subject: net: fix skb_panic to output real address In skb_panic() the real pointer values are really needed to diagnose issues, e.g. data and head are related (to calculate headroom). The hashed versions of the addresses don't make much sense here. The patch uses the printk specifier %px to print the actual address. The printk documentation on %px: https://www.kernel.org/doc/html/latest/core-api/printk-formats.html#unmodified-addresses Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7e29590482ce..1bf0c3d278e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : ""); -- cgit v1.2.3 From a51c328df3106663879645680609eb49b3ff6444 Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 1 May 2020 08:53:15 +0800 Subject: net: qos: introduce a gate control flow action Introduce an ingress frame gate control flow action. The tc gate action works like this: assume there is a gate that lets specified ingress frames pass in certain time slots and drops them in other time slots. A tc filter selects the ingress frames, and the tc gate action specifies in which time slots those frames may be passed to the device and in which time slots they are dropped. The gate action provides an entry list that tells how long the gate stays open and how long it stays closed. The gate action also assigns a start time that tells when the entry list starts; the driver then repeats the gate entry list cyclically. For the software simulation, the gate action requires the user to assign a clock type. Below is a setup example in user space. The tc filter matches a stream whose source ip address is 192.168.0.20, and the gate action owns two time slots. One lasts 200ms with the gate open to let frames pass; the other lasts 100ms with the gate closed so frames are dropped. When the ingress frames exceed a total of 8000000 bytes, the excess frames will be dropped in that 200000000ns time slot. > tc qdisc add dev eth0 ingress > tc filter add dev eth0 parent ffff: protocol ip \ flower src_ip 192.168.0.20 \ action gate index 2 clockid CLOCK_TAI \ sched-entry open 200000000 -1 8000000 \ sched-entry close 100000000 -1 -1 > tc chain del dev eth0 ingress chain 0 "sched-entry" follows the taprio naming style. The gate state is "open"/"close", followed by the period in nanoseconds. The next item is the internal priority value, which selects the ingress queue the frames should be put on; "-1" means wildcard. The last, optional value specifies the maximum number of MSDU octets that are permitted to pass the gate during the specified time interval. If base-time is not set, it defaults to 0; as a result, the start time will be ((N + 1) * cycletime), the smallest such value lying in the future. The example below filters a stream whose destination mac address is 10:00:80:00:00:00 and whose ip type is ICMP, followed by the gate action. The gate action runs with a single close time slot, which means it always keeps closed. The time cycle totals 200000000ns. The base-time is calculated by: 1357000000000 + (N + 1) * cycletime When the resulting value lies in the future, it becomes the start time. The cycletime here would be 200000000ns for this case. > tc filter add dev eth0 parent ffff: protocol ip \ flower skip_hw ip_proto icmp dst_mac 10:00:80:00:00:00 \ action gate index 12 base-time 1357000000000 \ sched-entry close 200000000 -1 -1 \ clockid CLOCK_TAI Signed-off-by: Po Liu Signed-off-by: David S. Miller --- include/net/tc_act/tc_gate.h | 47 +++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_gate.h | 47 +++ net/sched/Kconfig | 12 + net/sched/Makefile | 1 + net/sched/act_gate.c | 636 ++++++++++++++++++++++++++++++++++++ 6 files changed, 744 insertions(+) create mode 100644 include/net/tc_act/tc_gate.h create mode 100644 include/uapi/linux/tc_act/tc_gate.h create mode 100644 net/sched/act_gate.c (limited to 'net') diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h new file mode 100644 index 000000000000..330ad8b02495 --- /dev/null +++ b/include/net/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2020 NXP */ + +#ifndef __NET_TC_GATE_H +#define __NET_TC_GATE_H + +#include +#include + +struct tcfg_gate_entry { + int index; + u8 gate_state; + u32 interval; + s32 ipv; + s32 maxoctets; + struct list_head list; +}; + +struct tcf_gate_params { + s32 tcfg_priority; + u64 tcfg_basetime; + u64 tcfg_cycletime; + u64 tcfg_cycletime_ext; + u32 tcfg_flags; + s32 tcfg_clockid; + size_t num_entries; + struct list_head entries; +}; + +#define GATE_ACT_GATE_OPEN BIT(0) +#define GATE_ACT_PENDING BIT(1) + +struct tcf_gate { + struct tc_action common; + struct tcf_gate_params param; + u8 current_gate_status; + ktime_t current_close_time; + u32 current_entry_octets; + s32 current_max_octets; + struct tcfg_gate_entry *next_entry; + struct hrtimer hitimer; + enum tk_offsets tk_offset; +}; + +#define to_gate(a) ((struct tcf_gate *)a) + +#endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9f06d29cab70..fc672b232437 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -134,6 +134,7 @@ enum tca_id { TCA_ID_CTINFO, TCA_ID_MPLS, TCA_ID_CT, + TCA_ID_GATE, /* other actions go here */ __TCA_ID_MAX = 255 }; diff --git
a/include/uapi/linux/tc_act/tc_gate.h b/include/uapi/linux/tc_act/tc_gate.h new file mode 100644 index 000000000000..f214b3a6d44f --- /dev/null +++ b/include/uapi/linux/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* Copyright 2020 NXP */ + +#ifndef __LINUX_TC_GATE_H +#define __LINUX_TC_GATE_H + +#include + +struct tc_gate { + tc_gen; +}; + +enum { + TCA_GATE_ENTRY_UNSPEC, + TCA_GATE_ENTRY_INDEX, + TCA_GATE_ENTRY_GATE, + TCA_GATE_ENTRY_INTERVAL, + TCA_GATE_ENTRY_IPV, + TCA_GATE_ENTRY_MAX_OCTETS, + __TCA_GATE_ENTRY_MAX, +}; +#define TCA_GATE_ENTRY_MAX (__TCA_GATE_ENTRY_MAX - 1) + +enum { + TCA_GATE_ONE_ENTRY_UNSPEC, + TCA_GATE_ONE_ENTRY, + __TCA_GATE_ONE_ENTRY_MAX, +}; +#define TCA_GATE_ONE_ENTRY_MAX (__TCA_GATE_ONE_ENTRY_MAX - 1) + +enum { + TCA_GATE_UNSPEC, + TCA_GATE_TM, + TCA_GATE_PARMS, + TCA_GATE_PAD, + TCA_GATE_PRIORITY, + TCA_GATE_ENTRY_LIST, + TCA_GATE_BASE_TIME, + TCA_GATE_CYCLE_TIME, + TCA_GATE_CYCLE_TIME_EXT, + TCA_GATE_FLAGS, + TCA_GATE_CLOCKID, + __TCA_GATE_MAX, +}; +#define TCA_GATE_MAX (__TCA_GATE_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bfbefb7bff9d..2f20073f4f84 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -981,6 +981,18 @@ config NET_ACT_CT To compile this code as a module, choose M here: the module will be called act_ct. +config NET_ACT_GATE + tristate "Frame gate entry list control tc action" + depends on NET_CLS_ACT + help + Say Y here to allow to control the ingress flow to be passed at + specific time slot and be dropped at other specific time slot by + the gate entry list. + + If unsure, say N. + To compile this code as a module, choose M here: the + module will be called act_gate. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index 31c367a6cd09..66bbf9a98f9e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o +obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c new file mode 100644 index 000000000000..35fc48795541 --- /dev/null +++ b/net/sched/act_gate.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright 2020 NXP */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int gate_net_id; +static struct tc_action_ops act_gate_ops; + +static ktime_t gate_get_time(struct tcf_gate *gact) +{ + ktime_t mono = ktime_get(); + + switch (gact->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, gact->tk_offset); + } + + return KTIME_MAX; +} + +static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +{ + struct tcf_gate_params *param = &gact->param; + ktime_t now, base, cycle; + u64 n; + + base = ns_to_ktime(param->tcfg_basetime); + now = gate_get_time(gact); + + if (ktime_after(base, now)) { + *start = base; + return 0; + } + + cycle = param->tcfg_cycletime; + + /* cycle time should not be zero */ + if (!cycle) + return -EFAULT; + + n = div64_u64(ktime_sub_ns(now, base), cycle); + *start = 
ktime_add_ns(base, (n + 1) * cycle); + return 0; +} + +static void gate_start_timer(struct tcf_gate *gact, ktime_t start) +{ + ktime_t expires; + + expires = hrtimer_get_expires(&gact->hitimer); + if (expires == 0) + expires = KTIME_MAX; + + start = min_t(ktime_t, start, expires); + + hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); +} + +static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) +{ + struct tcf_gate *gact = container_of(timer, struct tcf_gate, + hitimer); + struct tcf_gate_params *p = &gact->param; + struct tcfg_gate_entry *next; + ktime_t close_time, now; + + spin_lock(&gact->tcf_lock); + + next = gact->next_entry; + + /* cycle start, clear pending bit, clear total octets */ + gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; + gact->current_entry_octets = 0; + gact->current_max_octets = next->maxoctets; + + gact->current_close_time = ktime_add_ns(gact->current_close_time, + next->interval); + + close_time = gact->current_close_time; + + if (list_is_last(&next->list, &p->entries)) + next = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + else + next = list_next_entry(next, list); + + now = gate_get_time(gact); + + if (ktime_after(now, close_time)) { + ktime_t cycle, base; + u64 n; + + cycle = p->tcfg_cycletime; + base = ns_to_ktime(p->tcfg_basetime); + n = div64_u64(ktime_sub_ns(now, base), cycle); + close_time = ktime_add_ns(base, (n + 1) * cycle); + } + + gact->next_entry = next; + + hrtimer_set_expires(&gact->hitimer, close_time); + + spin_unlock(&gact->tcf_lock); + + return HRTIMER_RESTART; +} + +static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_gate *gact = to_gate(a); + + spin_lock(&gact->tcf_lock); + + tcf_lastuse_update(&gact->tcf_tm); + bstats_update(&gact->tcf_bstats, skb); + + if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { + spin_unlock(&gact->tcf_lock); + return gact->tcf_action; + } + + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + goto drop; + + if (gact->current_max_octets >= 0) { + gact->current_entry_octets += qdisc_pkt_len(skb); + if (gact->current_entry_octets > gact->current_max_octets) { + gact->tcf_qstats.overlimits++; + goto drop; + } + } + + spin_unlock(&gact->tcf_lock); + + return gact->tcf_action; +drop: + gact->tcf_qstats.drops++; + spin_unlock(&gact->tcf_lock); + + return TC_ACT_SHOT; +} + +static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { + [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, + [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, +}; + +static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { + [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate), + .type = NLA_EXACT_LEN }, + [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, + [TCA_GATE_FLAGS] = { .type = NLA_U32 }, + [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); + + if (tb[TCA_GATE_ENTRY_INTERVAL]) + interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); + + if (interval == 0) { + 
NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + if (tb[TCA_GATE_ENTRY_IPV]) + entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); + else + entry->ipv = -1; + + if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) + entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); + else + entry->maxoctets = -1; + + return 0; +} + +static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_gate_entry(tb, entry, extack); +} + +static void release_entry_list(struct list_head *entries) +{ + struct tcfg_gate_entry *entry, *e; + + list_for_each_entry_safe(entry, e, entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int parse_gate_list(struct nlattr *list_attr, + struct tcf_gate_params *sched, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list_attr) + return -EINVAL; + + nla_for_each_nested(n, list_attr, rem) { + if (nla_type(n) != TCA_GATE_ONE_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + err = -ENOMEM; + goto release_list; + } + + err = parse_gate_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + goto release_list; + } + + list_add_tail(&entry->list, &sched->entries); + i++; + } + + sched->num_entries = i; + + return i; + +release_list: + release_entry_list(&sched->entries); + + return err; +} + +static int tcf_gate_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + enum tk_offsets tk_offset = TK_OFFS_TAI; + struct nlattr *tb[TCA_GATE_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tcf_gate_params *p; + s32 clockid = CLOCK_TAI; + struct tcf_gate *gact; + struct tc_gate *parm; + int ret = 0, err; + u64 basetime = 0; + u32 gflags = 0; + s32 prio = -1; + ktime_t start; + u32 index; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_GATE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_GATE_PARMS]); + index = parm->index; + + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + + if (err && bind) + return 0; + + if (!err) { + ret = tcf_idr_create(tn, index, est, a, + &act_gate_ops, bind, false, 0); + if (ret) { + tcf_idr_cleanup(tn, index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + + if (tb[TCA_GATE_PRIORITY]) + prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); + + if (tb[TCA_GATE_BASE_TIME]) + basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); + + if (tb[TCA_GATE_FLAGS]) + gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); + + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case 
CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + goto release_idr; + } + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + gact = to_gate(*a); + + spin_lock_bh(&gact->tcf_lock); + p = &gact->param; + + if (tb[TCA_GATE_CYCLE_TIME]) { + p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + if (!p->tcfg_cycletime_ext) + goto chain_put; + } + + INIT_LIST_HEAD(&p->entries); + if (tb[TCA_GATE_ENTRY_LIST]) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto chain_put; + } + + if (!p->tcfg_cycletime) { + struct tcfg_gate_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &p->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + p->tcfg_cycletime = cycle; + } + + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + p->tcfg_cycletime_ext = + nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + + p->tcfg_priority = prio; + p->tcfg_basetime = basetime; + p->tcfg_clockid = clockid; + p->tcfg_flags = gflags; + + gact->tk_offset = tk_offset; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; + + err = gate_get_start_time(gact, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Internal error: failed get start time"); + release_entry_list(&p->entries); + goto chain_put; + } + + gact->current_close_time = start; + gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; + + gact->next_entry = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + + gate_start_timer(gact, start); + + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +chain_put: + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_gate_cleanup(struct tc_action *a) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_gate_params *p; + + hrtimer_cancel(&gact->hitimer); + + p = &gact->param; + + release_entry_list(&p->entries); +} + +static int dumping_entry(struct sk_buff *skb, + struct tcfg_gate_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) + goto nla_put_failure; + + return nla_nest_end(skb, item); + +nla_put_failure: + nla_nest_cancel(skb, item); + return -1; +} + +static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gate *gact = to_gate(a); + struct tc_gate opt = { + .index = gact->tcf_index, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, + }; + struct tcfg_gate_entry *entry; + struct tcf_gate_params *p; + struct nlattr *entry_list; + struct tcf_t t; + + spin_lock_bh(&gact->tcf_lock); + opt.action = gact->tcf_action; + 
+ p = &gact->param; + + if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, + p->tcfg_basetime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, + p->tcfg_cycletime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, + p->tcfg_cycletime_ext, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) + goto nla_put_failure; + + entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); + if (!entry_list) + goto nla_put_failure; + + list_for_each_entry(entry, &p->entries, list) { + if (dumping_entry(skb, entry) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, entry_list); + + tcf_tm_dump(&t, &gact->tcf_tm); + if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) + goto nla_put_failure; + spin_unlock_bh(&gact->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&gact->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_gate_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_t *tm = &gact->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + +static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_idr_search(tn, a, index); +} + +static size_t tcf_gate_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_gate)); +} + +static struct tc_action_ops act_gate_ops = { + .kind = "gate", + .id = TCA_ID_GATE, + .owner = THIS_MODULE, + .act = tcf_gate_act, + .dump = tcf_gate_dump, + .init = tcf_gate_init, + .cleanup = tcf_gate_cleanup, + .walk = tcf_gate_walker, + .stats_update = tcf_gate_stats_update, + .get_fill_size = tcf_gate_get_fill_size, + .lookup = tcf_gate_search, + .size = sizeof(struct tcf_gate), +}; + +static __net_init int gate_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tc_action_net_init(net, tn, &act_gate_ops); +} + +static void __net_exit gate_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, gate_net_id); +} + +static struct pernet_operations gate_net_ops = { + .init = gate_init_net, + .exit_batch = gate_exit_net, + .id = &gate_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init gate_init_module(void) +{ + return tcf_register_action(&act_gate_ops, &gate_net_ops); +} + +static void __exit gate_cleanup_module(void) +{ + tcf_unregister_action(&act_gate_ops, &gate_net_ops); +} + +module_init(gate_init_module); +module_exit(gate_cleanup_module); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From d29bdd69ecdd70e8e3c2268fc8e188d6ab55e54a Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 1 May 2020 08:53:16 +0800 Subject: net: schedule: add action gate offloading Add the gate action to the flow action 
entry. Add the gate parameters to tc_setup_flow_action(), queueing them into the entries of the flow_action_entry array provided to the driver. Signed-off-by: Po Liu Signed-off-by: David S. Miller --- include/net/flow_offload.h | 10 +++++ include/net/tc_act/tc_gate.h | 99 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 33 +++++++++++++++ 3 files changed, 142 insertions(+) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3619c6acf60f..94a30fe02e6d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -147,6 +147,7 @@ enum flow_action_id { FLOW_ACTION_MPLS_PUSH, FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, + FLOW_ACTION_GATE, NUM_FLOW_ACTIONS, }; @@ -255,6 +256,15 @@ struct flow_action_entry { u8 bos; u8 ttl; } mpls_mangle; + struct { + u32 index; + s32 prio; + u64 basetime; + u64 cycletime; + u64 cycletimeext; + u32 num_entries; + struct action_gate_entry *entries; + } gate; }; struct flow_action_cookie *cookie; /* user defined action cookie */ }; diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index 330ad8b02495..8bc6be81a7ad 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -7,6 +7,13 @@ #include #include +struct action_gate_entry { + u8 gate_state; + u32 interval; + s32 ipv; + s32 maxoctets; +}; + struct tcfg_gate_entry { int index; u8 gate_state; @@ -44,4 +51,96 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) +static inline bool is_tcf_gate(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->id == TCA_ID_GATE) + return true; +#endif + return false; +} + +static inline u32 tcf_gate_index(const struct tc_action *a) +{ + return a->tcfa_index; +} + +static inline s32 tcf_gate_prio(const struct tc_action *a) +{ + s32 tcfg_prio; + + tcfg_prio = to_gate(a)->param.tcfg_priority; + + return tcfg_prio; +} + +static inline u64 tcf_gate_basetime(const struct tc_action *a) +{ + u64 tcfg_basetime; + + tcfg_basetime = to_gate(a)->param.tcfg_basetime; + + return tcfg_basetime; +} + +static inline u64 tcf_gate_cycletime(const struct tc_action *a) +{ + u64 tcfg_cycletime; + + tcfg_cycletime = to_gate(a)->param.tcfg_cycletime; + + return tcfg_cycletime; +} + +static inline u64 tcf_gate_cycletimeext(const struct tc_action *a) +{ + u64 tcfg_cycletimeext; + + tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext; + + return tcfg_cycletimeext; +} + +static inline u32 tcf_gate_num_entries(const struct tc_action *a) +{ + u32 num_entries; + + num_entries = to_gate(a)->param.num_entries; + + return num_entries; +} + +static inline struct action_gate_entry + *tcf_gate_get_list(const struct tc_action *a) +{ + struct action_gate_entry *oe; + struct tcf_gate_params *p; + struct tcfg_gate_entry *entry; + u32 num_entries; + int i = 0; + + p = &to_gate(a)->param; + num_entries = p->num_entries; + + list_for_each_entry(entry, &p->entries, list) + i++; + + if (i != num_entries) + return NULL; + + oe = kcalloc(num_entries, sizeof(*oe), GFP_ATOMIC); + if (!oe) + return NULL; + + i = 0; + list_for_each_entry(entry, &p->entries, list) { + oe[i].gate_state = entry->gate_state; + oe[i].interval = entry->interval; + oe[i].ipv = entry->ipv; + oe[i].maxoctets = entry->maxoctets; + i++; + } + + return oe; +} #endif
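A hedged sketch of how a driver's flow-action parser might consume a FLOW_ACTION_GATE entry built by the code below (all foo_* names are hypothetical, not from any in-tree driver):

static int foo_setup_gate(struct foo_priv *priv,
			  const struct flow_action_entry *entry)
{
	u32 i;

	for (i = 0; i < entry->gate.num_entries; i++) {
		const struct action_gate_entry *e = &entry->gate.entries[i];

		/* e->gate_state: open (non-zero) or closed,
		 * e->interval: slot length in ns,
		 * e->ipv: internal priority, -1 is wildcard,
		 * e->maxoctets: octet budget per interval, -1 is unlimited
		 */
		foo_hw_program_slot(priv, i, e->gate_state, e->interval,
				    e->ipv, e->maxoctets);
	}

	/* entry->gate.index/prio/basetime/cycletime/cycletimeext carry
	 * the schedule-level parameters filled in by tc_setup_flow_action().
	 */
	return foo_hw_commit(priv, entry->gate.basetime,
			     entry->gate.cycletime);
}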
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 11b683c45c28..7e85c91d0752 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,6 +39,7 @@ #include #include #include +#include #include extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -3526,6 +3527,27 @@ static void tcf_sample_get_group(struct flow_action_entry *entry, #endif } +static void tcf_gate_entry_destructor(void *priv) +{ + struct action_gate_entry *oe = priv; + + kfree(oe); +} + +static int tcf_gate_get_entries(struct flow_action_entry *entry, + const struct tc_action *act) +{ + entry->gate.entries = tcf_gate_get_list(act); + + if (!entry->gate.entries) + return -EINVAL; + + entry->destructor = tcf_gate_entry_destructor; + entry->destructor_priv = entry->gate.entries; + + return 0; +} + int tc_setup_flow_action(struct flow_action *flow_action, const struct tcf_exts *exts) { @@ -3672,6 +3694,17 @@ int tc_setup_flow_action(struct flow_action *flow_action, } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); + } else if (is_tcf_gate(act)) { + entry->id = FLOW_ACTION_GATE; + entry->gate.index = tcf_gate_index(act); + entry->gate.prio = tcf_gate_prio(act); + entry->gate.basetime = tcf_gate_basetime(act); + entry->gate.cycletime = tcf_gate_cycletime(act); + entry->gate.cycletimeext = tcf_gate_cycletimeext(act); + entry->gate.num_entries = tcf_gate_num_entries(act); + err = tcf_gate_get_entries(entry, act); + if (err) + goto err_out; } else { err = -EOPNOTSUPP; goto err_out_locked; -- cgit v1.2.3 From 7562a13d5a8ce9bc5020705da5f50221021f5a2c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:01 +0200 Subject: net/smc: multiple link support for rmb buffer registration The CONFIRM_RKEY LLC processing handles all links in one LLC message. Move the call to this processing out of smcr_link_reg_rmb(), which does per-link processing, into smcr_lgr_reg_rmbs(), which is responsible for link-group-level processing. Move smcr_link_reg_rmb() into module smc_core.c. From af_smc.c, now call smcr_lgr_reg_rmbs() to register new rmbs on all available links. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/af_smc.c | 54 +++++++++++++++++++----------------------------------- net/smc/smc_core.c | 16 ++++++++++++++++ net/smc/smc_core.h | 1 + 3 files changed, 36 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index bd9662d06896..20d6d3fbb86c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -337,46 +337,30 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, send confirm_rkey msg to register with peer */ -static int smcr_link_reg_rmb(struct smc_link *link, - struct smc_buf_desc *rmb_desc, bool conf_rkey) -{ - if (!rmb_desc->is_reg_mr[link->link_idx]) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { - rmb_desc->is_reg_err = true; - return -EFAULT; - } - rmb_desc->is_reg_mr[link->link_idx] = true; - } - if (!conf_rkey) - return 0; - - /* exchange confirm_rkey msg with peer */ - if (!rmb_desc->is_conf_rkey) { - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->is_reg_err = true; - return -EFAULT; - } - rmb_desc->is_conf_rkey = true; - } - return 0; -} - /* register the new rmb on all links */ -static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, +static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - int i, rc; + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_ACTIVE) continue; - rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc, true); + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); if (rc) - return rc; + goto out; } - return 0; + + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; + } + rmb_desc->is_conf_rkey = true; +out: + return rc; } static int smcr_clnt_conf_first_link(struct smc_sock *smc) @@ -408,7 +392,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* confirm_rkey is implicit on 1st contact */ @@ -670,7 +654,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smcr_lgr_reg_rmbs(smc->conn.lgr, smc->conn.rmb_desc)) + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -1045,7 +1029,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) link->lgr->type = SMC_LGR_SINGLE; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1220,7 +1204,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smcr_lgr_reg_rmbs(conn->lgr, conn->rmb_desc)) + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3539ceef9a97..de6bc36fe9a7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1127,6 +1127,22 @@ free_table: return rc; } +/* register a new rmb on IB device */ +int 
smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +{ + if (list_empty(&link->lgr->list)) + return -ENOLINK; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_reg_mr[link->link_idx] = true; + } + return 0; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f12474cc666c..fd512188d2c6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -367,6 +367,7 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; -- cgit v1.2.3 From 4a3641c160873fe6b6bcff00a6ea15e7430d8d42 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:02 +0200 Subject: net/smc: unmapping of buffers to support multiple links With the support of multiple links that are created and cleared there is a need to unmap one link from all current buffers. Add unmapping by link and by rmb. And make smcr_link_clear() available to be called from the LLC layer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 76 ++++++++++++++++++++++++++++++++++++++++++------------ net/smc/smc_core.h | 1 + 2 files changed, 60 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index de6bc36fe9a7..d5ecea490b4e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -498,14 +498,69 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smcr_link_clear(struct smc_link *lnk) +/* unregister a link from a buf_desc */ +static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + if (is_rmb) + buf_desc->is_reg_mr[lnk->link_idx] = false; + if (!buf_desc->is_map_ib[lnk->link_idx]) + return; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) { + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + buf_desc->mr_rx[lnk->link_idx] = NULL; + } + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + buf_desc->is_map_ib[lnk->link_idx] = false; +} + +/* unmap all buffers of lgr for a deleted link */ +static void smcr_buf_unmap_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + mutex_lock(&lgr->rmbs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) + smcr_buf_unmap_link(buf_desc, true, lnk); + mutex_unlock(&lgr->rmbs_lock); + mutex_lock(&lgr->sndbufs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], + list) + smcr_buf_unmap_link(buf_desc, false, lnk); + mutex_unlock(&lgr->sndbufs_lock); + } +} + +static void smcr_rtoken_clear_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; + } +} + +void smcr_link_clear(struct smc_link *lnk) { struct smc_ib_device *smcibdev; - if 
(lnk->peer_qpn == 0) + if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); @@ -522,23 +577,10 @@ static void smcr_link_clear(struct smc_link *lnk) static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk; int i; - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &lgr->lnk[i]; - if (!buf_desc->is_map_ib[lnk->link_idx]) - continue; - if (is_rmb) { - if (buf_desc->mr_rx[lnk->link_idx]) - smc_ib_put_memory_region( - buf_desc->mr_rx[lnk->link_idx]); - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); - } - sg_free_table(&buf_desc->sgt[lnk->link_idx]); - } + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fd512188d2c6..fa532a423fd7 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -367,6 +367,7 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +void smcr_link_clear(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { -- cgit v1.2.3 From fb33d27727254618aaf6bc2fedcb0fda1d5c0239 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:03 +0200 Subject: net/smc: map and register buffers for a new link Introduce support to map and register all current buffers for a new link. smcr_buf_map_lgr() will map used buffers for a new link and smcr_buf_reg_lgr() can be called to register used buffers on the IB device of the new link. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 2 ++ 2 files changed, 62 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d5ecea490b4e..0e87f652caea 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1185,6 +1185,66 @@ int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) return 0; } +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, + struct list_head *lst, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf; + int rc = 0; + + mutex_lock(lock); + list_for_each_entry_safe(buf_desc, bf, lst, list) { + if (!buf_desc->used) + continue; + rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); + if (rc) + goto out; + } +out: + mutex_unlock(lock); + return rc; +} + +/* map all used buffers of lgr for a new link */ +int smcr_buf_map_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i, rc = 0; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, + &lgr->rmbs[i], true); + if (rc) + return rc; + rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, + &lgr->sndbufs[i], false); + if (rc) + return rc; + } + return 0; +} + +/* register all used buffers of lgr for a new link */ +int smcr_buf_reg_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i, rc = 0; + + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { + if (!buf_desc->used) + continue; + rc = smcr_link_reg_rmb(lnk, buf_desc); + if (rc) + goto out; + } + } +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fa532a423fd7..61ddb5264936 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -368,6 +368,8 @@ int smc_core_init(void); void smc_core_exit(void); void smcr_link_clear(struct smc_link *lnk); +int smcr_buf_map_lgr(struct smc_link *lnk); +int smcr_buf_reg_lgr(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { -- cgit v1.2.3 From fbed3b37c89633eb602f4ec8e30186e601b793e5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:04 +0200 Subject: net/smc: extend smc_llc_send_add_link() and smc_llc_send_delete_link() All LLC sends are done from worker context only, so remove the prep functions which were used to build the message before it was sent, and add the function content into the respective send function smc_llc_send_add_link() and smc_llc_send_delete_link(). Extend smc_llc_send_add_link() to include the qp_mtu value in the LLC message, which is needed to establish a link after the initial link was created. Extend smc_llc_send_delete_link() to contain a link_id and a reason code for the link deletion in the LLC message, which is needed when a specific link should be deleted. And add the list of existing DELETE_LINK reason codes. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 3 +- net/smc/smc_llc.c | 89 ++++++++++++++++++++++++++++-------------------------- net/smc/smc_llc.h | 16 ++++++++-- 3 files changed, 62 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0e87f652caea..c905675017c7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -199,7 +199,8 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { + !smc_llc_send_delete_link(lnk, 0, SMC_LLC_REQ, orderly, + SMC_LLC_DEL_PROG_INIT_TERM)) { return 0; } return -ENOTCONN; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 327cf30b98cc..171835926db6 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -58,7 +58,13 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 sender_gid[SMC_GID_SIZE]; u8 sender_qp_num[3]; u8 link_num; - u8 flags2; /* QP mtu */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved3 : 4, + qp_mtu : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + reserved3 : 4; +#endif u8 initial_psn[3]; u8 reserved[8]; }; @@ -427,26 +433,9 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, return rc; } -/* prepare an add link message */ -static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], u8 gid[], - enum smc_llc_reqresp reqresp) -{ - memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) { - addllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* always reject more links for now */ - addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; - addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; - } - memcpy(addllc->sender_mac, mac, ETH_ALEN); - memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); -} - /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp) { struct smc_llc_msg_add_link *addllc; @@ -458,32 +447,33 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], if (rc) return rc; addllc = (struct smc_llc_msg_add_link *)wr_buf; - smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); + + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.type = SMC_LLC_ADD_LINK; + addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + if (link_new) { + addllc->link_num = link_new->link_id; + hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num); + hton24(addllc->initial_psn, link_new->psn_initial); + if (reqresp == SMC_LLC_REQ) + addllc->qp_mtu = link_new->path_mtu; + else + addllc->qp_mtu = min(link_new->path_mtu, + link_new->peer_mtu); + } /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* prepare a delete link message */ -static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, - struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) -{ - memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) - delllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* DEL_LINK_ALL because only 1 link supported */ - delllc->hd.flags |= 
SMC_LLC_FLAG_DEL_LINK_ALL; - if (orderly) - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; - delllc->link_num = link->link_id; -} - /* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; @@ -494,7 +484,19 @@ int smc_llc_send_delete_link(struct smc_link *link, if (rc) return rc; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp, orderly); + + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.type = SMC_LLC_DELETE_LINK; + delllc->hd.length = sizeof(struct smc_llc_msg_del_link); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (link_del_id) + delllc->link_num = link_del_id; + else + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -547,12 +549,13 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_forget(lgr); if (lgr->role == SMC_SERV) { /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); + smc_llc_send_delete_link(link, 0, SMC_LLC_REQ, true, + SMC_LLC_DEL_PROG_INIT_TERM); } else { /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); + smc_llc_send_delete_link(link, 0, SMC_LLC_RESP, true, + SMC_LLC_DEL_PROG_INIT_TERM); } - smc_llc_send_message(link, llc); smc_lgr_terminate_sched(lgr); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 48029a5e14c3..d2c50d3e43a6 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,16 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +/* LLC DELETE LINK Request Reason Codes */ +#define SMC_LLC_DEL_LOST_PATH 0x00010000 +#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 +#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000 +#define SMC_LLC_DEL_PROT_VIOL 0x00040000 +#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000 +/* LLC DELETE LINK Response Reason Codes */ +#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */ +#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */ + /* returns a usable link of the link group, or NULL */ static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) { @@ -50,9 +60,11 @@ static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp); -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly); +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -- cgit v1.2.3 From d550066776aae3bb31e0240cab24f62e33c47fd3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:05 +0200 Subject: net/smc: mutex to protect the lgr against parallel reconfigurations Introduce llc_conf_mutex in the link group 
which is used to protect the buffers and lgr states against parallel link reconfiguration. This ensures that new connections do not start to register buffers with the links of a link group when link creation or termination is running. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 9 +++++++++ net/smc/smc_core.c | 26 ++++++++++++++++++++++---- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 9 +-------- 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 20d6d3fbb86c..6663a63be9e4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -344,6 +344,13 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_link_group *lgr = link->lgr; int i, rc = 0; + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel smcr_link_reg_rmb() + */ + mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_ACTIVE) continue; @@ -360,6 +367,8 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } rmb_desc->is_conf_rkey = true; out: + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c905675017c7..4c3af05d76a5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -448,11 +448,21 @@ out: static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, struct smc_link_group *lgr) { + int rc; + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { /* unregister rmb with peer */ - smc_llc_do_delete_rkey(lgr, rmb_desc); - rmb_desc->is_conf_rkey = false; + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (!rc) { + /* protect against smc_llc_cli_rkey_exchange() */ + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_do_delete_rkey(lgr, rmb_desc); + rmb_desc->is_conf_rkey = false; + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + } } + if (rmb_desc->is_reg_err) { /* buf registration failed, reuse not possible */ mutex_lock(&lgr->rmbs_lock); @@ -552,6 +562,7 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } +/* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk) { struct smc_ib_device *smcibdev; @@ -1170,7 +1181,9 @@ free_table: return rc; } -/* register a new rmb on IB device */ +/* register a new rmb on IB device, + * must be called under lgr->llc_conf_mutex lock + */ int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) { if (list_empty(&link->lgr->list)) @@ -1224,7 +1237,9 @@ int smcr_buf_map_lgr(struct smc_link *lnk) return 0; } -/* register all used buffers of lgr for a new link */ +/* register all used buffers of lgr for a new link, + * must be called under lgr->llc_conf_mutex lock + */ int smcr_buf_reg_lgr(struct smc_link *lnk) { struct smc_link_group *lgr = lnk->lgr; @@ -1278,6 +1293,8 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, { int i, rc = 0; + /* protect against parallel link reconfiguration */ + mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -1289,6 +1306,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, } } out: + mutex_unlock(&lgr->llc_conf_mutex); return rc; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 61ddb5264936..aa198dd0f0e4 100644 --- a/net/smc/smc_core.h +++ 
b/net/smc/smc_core.h @@ -248,6 +248,8 @@ struct smc_link_group { /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ + struct mutex llc_conf_mutex; + /* protects lgr reconfig. */ struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_waiter; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 171835926db6..ceed3c89926f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -848,6 +848,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); init_waitqueue_head(&lgr->llc_waiter); + mutex_init(&lgr->llc_conf_mutex); lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; } @@ -897,9 +898,6 @@ int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_llc_qentry *qentry = NULL; int rc = 0; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); if (rc) goto out; @@ -911,7 +909,6 @@ int smc_llc_do_confirm_rkey(struct smc_link *send_link, out: if (qentry) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } @@ -927,9 +924,6 @@ int smc_llc_do_delete_rkey(struct smc_link_group *lgr, if (!send_link) return -ENOLINK; - rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); - if (rc) - return rc; /* protected by llc_flow control */ rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) @@ -942,7 +936,6 @@ int smc_llc_do_delete_rkey(struct smc_link_group *lgr, out: if (qentry) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } -- cgit v1.2.3 From 35dcf7ec02dcff16504bc52a368822254f889f00 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:06 +0200 Subject: net/smc: remember PNETID of IB device for later device matching The PNETID is needed to find an alternate link for a link group. Save the PNETID of the link that is used to create the link group for later device matching. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 ++ net/smc/smc_core.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4c3af05d76a5..d7ab92fc5b15 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -413,6 +413,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); + memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1], + SMC_MAX_PNETID_LEN); smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index aa198dd0f0e4..413eaad50c7f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -244,6 +244,8 @@ struct smc_link_group { u8 next_link_id; enum smc_lgr_type type; /* redundancy state */ + u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; + /* pnet id of this lgr */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; -- cgit v1.2.3 From 1f90a05d9ff907c70456e7c9d7058372679a88c6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:07 +0200 Subject: net/smc: add smcr_port_add() and smcr_link_up() processing Call smcr_port_add() when an IB event reports a new active IB device. 
smcr_port_add() will start a work which either triggers the local ADD_LINK processing, or send an ADD_LINK LLC message to the SMC server to initiate the processing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_core.h | 1 + net/smc/smc_ib.c | 1 + 3 files changed, 88 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d7ab92fc5b15..20bc9e46bf52 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -44,10 +44,19 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); +struct smc_ib_up_work { + struct work_struct work; + struct smc_link_group *lgr; + struct smc_ib_device *smcibdev; + u8 ibport; +}; + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); +static void smc_link_up_work(struct work_struct *work); + /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) @@ -928,6 +937,83 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* link is up - establish alternate link if applicable */ +static void smcr_link_up(struct smc_link_group *lgr, + struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link *link = NULL; + + if (list_empty(&lgr->list) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + return; + + if (lgr->role == SMC_SERV) { + /* trigger local add link processing */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + /* tbd: call smc_llc_srv_add_link_local(link); */ + } else { + /* invite server to start add link processing */ + u8 gid[SMC_GID_SIZE]; + + if (smc_ib_determine_gid(smcibdev, ibport, lgr->vlan_id, gid, + NULL)) + return; + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* some other llc task is ongoing */ + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + } + if (list_empty(&lgr->list) || + !smc_ib_port_active(smcibdev, ibport)) + return; /* lgr or device no longer active */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], gid, + NULL, SMC_LLC_REQ); + } +} + +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_ib_up_work *ib_work; + struct smc_link_group *lgr, *n; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + continue; + ib_work = kmalloc(sizeof(*ib_work), GFP_KERNEL); + if (!ib_work) + continue; + INIT_WORK(&ib_work->work, smc_link_up_work); + ib_work->lgr = lgr; + ib_work->smcibdev = smcibdev; + ib_work->ibport = ibport; + schedule_work(&ib_work->work); + } +} + +static void smc_link_up_work(struct work_struct *work) +{ + struct smc_ib_up_work *ib_work = container_of(work, + struct smc_ib_up_work, + work); + struct smc_link_group *lgr = ib_work->lgr; + + if (list_empty(&lgr->list)) + goto out; + smcr_link_up(lgr, ib_work->smcibdev, ib_work->ibport); +out: + kfree(ib_work); +} + /* Determine vlan of internal TCP 
socket. * @vlan_id: address to store the determined vlan id into */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 413eaad50c7f..86453ad83491 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -345,6 +345,7 @@ void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index c090678a3e5a..545fb0bc3714 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -252,6 +252,7 @@ static void smc_ib_port_event_work(struct work_struct *work) smc_port_terminate(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); + smcr_port_add(smcibdev, port_idx + 1); } } } -- cgit v1.2.3 From 541afa10c126b6c22c2a805a559c70cc41fd156e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:08 +0200 Subject: net/smc: add smcr_port_err() and smcr_link_down() processing Call smcr_port_err() when an IB event reports an inactive IB device. smcr_port_err() calls smcr_link_down() for all affected links. smcr_link_down() either triggers the local DELETE_LINK processing, or sends a DELETE_LINK LLC message to the SMC server to initiate the processing. The old handler function smc_port_terminate() is removed. Add helper smcr_link_down_cond() to take a link down conditionally, and smcr_link_down_cond_sched() to schedule the link_down processing via a work. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc_core.c | 119 +++++++++++++++++++++++++++++++++++++++-------------- net/smc/smc_core.h | 6 ++- net/smc/smc_ib.c | 2 +- net/smc/smc_llc.h | 3 ++ 4 files changed, 98 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 20bc9e46bf52..62108e0cd529 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -56,6 +56,7 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); static void smc_link_up_work(struct work_struct *work); +static void smc_link_down_work(struct work_struct *work); /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, @@ -320,6 +321,7 @@ static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!ini->ib_dev->initialized) { rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); if (rc) @@ -818,36 +820,6 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) schedule_work(&lgr->terminate_work); } -/* Called when IB port is terminated */ -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - LIST_HEAD(lgr_free_list); - int i; - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd) - continue; - /* tbd - terminate only when no more links are active */ - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_usable(&lgr->lnk[i])) - continue; - if (lgr->lnk[i].smcibdev == smcibdev && - lgr->lnk[i].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - } - } - } - spin_unlock_bh(&smc_lgr_list.lock); - - list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { - list_del_init(&lgr->list); - __smc_lgr_terminate(lgr, false); - } -} - /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { @@ -1000,6 +972,79 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) } } +/* link is down - switch connections to alternate link, + * must be called under lgr->llc_conf_mutex lock + */ +static void smcr_link_down(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_link *to_lnk; + int del_link_id; + + if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) + return; + + smc_ib_modify_qp_reset(lnk); + to_lnk = NULL; + /* tbd: call to_lnk = smc_switch_conns(lgr, lnk, true); */ + if (!to_lnk) { /* no backup link available */ + smcr_link_clear(lnk); + return; + } + lgr->type = SMC_LGR_SINGLE; + del_link_id = lnk->link_id; + + if (lgr->role == SMC_SERV) { + /* trigger local delete link processing */ + } else { + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* another llc task is ongoing */ + mutex_unlock(&lgr->llc_conf_mutex); + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + mutex_lock(&lgr->llc_conf_mutex); + } + smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_down_cond(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + smcr_link_down(lnk); 
+} + +/* will get the lgr->llc_conf_mutex lock */ +void smcr_link_down_cond_sched(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + schedule_work(&lnk->link_down_wrk); +} + +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + int i; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk) && + lnk->smcibdev == smcibdev && lnk->ibport == ibport) + smcr_link_down_cond_sched(lnk); + } + } +} + static void smc_link_up_work(struct work_struct *work) { struct smc_ib_up_work *ib_work = container_of(work, @@ -1014,6 +1059,20 @@ out: kfree(ib_work); } +static void smc_link_down_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, struct smc_link, + link_down_wrk); + struct smc_link_group *lgr = link->lgr; + + if (list_empty(&lgr->list)) + return; + wake_up_interruptible_all(&lgr->llc_waiter); + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down(link); + mutex_unlock(&lgr->llc_conf_mutex); +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 86453ad83491..da3cddbd1651 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -117,6 +117,7 @@ struct smc_link { u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ struct smc_link_group *lgr; /* parent link group */ + struct work_struct link_down_wrk; /* wrk to bring link down */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ @@ -344,8 +345,8 @@ struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); @@ -376,6 +377,9 @@ void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +void smcr_link_down_cond(struct smc_link *lnk); +void smcr_link_down_cond_sched(struct smc_link *lnk); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 545fb0bc3714..2c743caad69a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -249,7 +249,7 @@ static void smc_ib_port_event_work(struct work_struct *work) clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) { set_bit(port_idx, smcibdev->ports_going_away); - smc_port_terminate(smcibdev, port_idx + 1); + smcr_port_err(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); smcr_port_add(smcibdev, port_idx + 1); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index d2c50d3e43a6..4ed4486e5082 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,9 @@ enum 
smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +#define smc_link_downing(state) \ + (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE) + /* LLC DELETE LINK Request Reason Codes */ #define SMC_LLC_DEL_LOST_PATH 0x00010000 #define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 -- cgit v1.2.3
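The smc_link_downing() macro added above is the serialization point for all link-down paths: cmpxchg() atomically moves the link state from SMC_LNK_ACTIVE to SMC_LNK_INACTIVE, so when several CPUs report the same failure, exactly one caller wins the transition and runs the teardown. Below is a minimal standalone sketch of this claim-the-transition pattern, using C11 atomics in place of the kernel's cmpxchg(); all names are invented for the illustration.

#include <stdatomic.h>
#include <stdio.h>

enum lnk_state { LNK_UNUSED, LNK_ACTIVE, LNK_INACTIVE };

/* returns 1 only for the caller that actually moved ACTIVE -> INACTIVE */
static int link_downing(_Atomic int *state)
{
	int expected = LNK_ACTIVE;

	return atomic_compare_exchange_strong(state, &expected, LNK_INACTIVE);
}

int main(void)
{
	_Atomic int state = LNK_ACTIVE;

	printf("first caller: %d\n", link_downing(&state));  /* 1: runs teardown */
	printf("second caller: %d\n", link_downing(&state)); /* 0: does nothing */
	return 0;
}

This is why smcr_link_down_cond() and smcr_link_down_cond_sched() can be called from many places without further locking: the losers of the race simply return.

From 87523930a16eb57ebb20318e92b5df4b64fe8b20 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:09 +0200 Subject: net/smc: take link down instead of terminating the link group Use the introduced link down processing in all places where the link group is terminated and take down the affected link only. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 7 ++----- net/smc/smc_llc.c | 4 ++-- net/smc/smc_tx.c | 2 +- net/smc/smc_wr.c | 19 ++++++++----------- 4 files changed, 13 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 62108e0cd529..849ae3f9b796 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -884,11 +884,8 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - break; - } + if (lgr->lnk[i].smcibdev == smcibdev) + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index ceed3c89926f..e478a4c11877 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -556,7 +556,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_send_delete_link(link, 0, SMC_LLC_RESP, true, SMC_LLC_DEL_PROG_INIT_TERM); } - smc_lgr_terminate_sched(lgr); + smcr_link_down_cond(link); } /* process a confirm_rkey request from peer, remote flow */ @@ -831,7 +831,7 @@ static void smc_llc_testlink_work(struct work_struct *work) if (link->state != SMC_LNK_ACTIVE) return; /* link state changed */ if (rc <= 0) { - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); return; } next_interval = link->llc_testlink_time; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index d74bfe6a90f1..417204572a69 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -283,7 +283,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) - smc_lgr_terminate_sched(lgr); + smcr_link_down_cond_sched(link); return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 93223628c002..031e6c9561b1 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -120,8 +120,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_bufs[i])); clear_bit(i, link->wr_tx_mask); } - /* terminate connections of this link group abnormally */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* terminate link */ + smcr_link_down_cond_sched(link); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -212,8 +212,8 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(lgr); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -270,7 +270,7 @@ int smc_wr_tx_send(struct smc_link *link,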
struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); } return rc; } @@ -294,8 +294,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -393,10 +393,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - /* terminate connections of this link group - * abnormally - */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); break; default: smc_wr_rx_post(link); /* refill WR RX */ -- cgit v1.2.3
From 33d203302d1cc744a13349d2576c985feb469220 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:10 +0200 Subject: net/smc: remove DELETE LINK processing from smc_core.c Support for multiple links makes the former DELETE LINK processing, which sent one DELETE_LINK LLC message for each single link, obsolete. Remove this processing from smc_core.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 849ae3f9b796..60c708f6de51 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -202,20 +202,6 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) smc_lgr_schedule_free_work_fast(lgr); } -/* Send delete link, either as client to request the initiation - * of the DELETE LINK sequence from server; or as server to - * initiate the delete processing. See smc_llc_rx_delete_link(). - */ -static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) -{ - if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, 0, SMC_LLC_REQ, orderly, - SMC_LLC_DEL_PROG_INIT_TERM)) { - return 0; - } - return -ENOTCONN; -} - static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) @@ -241,25 +227,6 @@ static void smc_lgr_free_work(struct work_struct *work) return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - - if (!lgr->is_smcd && !lgr->terminating) { - bool do_wait = false; - - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - struct smc_link *lnk = &lgr->lnk[i]; - /* try to send del link msg, on err free immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smcr_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a resp */ - smc_lgr_schedule_free_work(lgr); - do_wait = true; - } - } - if (do_wait) { - spin_unlock_bh(lgr_lock); - return; /* wait for resp, see smc_llc_rx_delete_link */ - } - } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); -- cgit v1.2.3
From 6c868a3edc70ec9819d6a94268625d25e6bc9587 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:11 +0200 Subject: net/smc: introduce smc_pnet_find_alt_roce() Introduce a new function in smc_pnet.c that searches for an alternate IB device, using an existing link group and a primary IB device.
The alternate IB device needs to be active and must have the same PNETID as the link group. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 15 +++++++++++++-- net/smc/smc_pnet.h | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index bd01c71b827a..50c96e843fab 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -777,7 +777,8 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, - struct smc_init_info *ini) + struct smc_init_info *ini, + struct smc_ib_device *known_dev) { struct smc_ib_device *ibdev; int i; @@ -785,6 +786,8 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, ini->ib_dev = NULL; spin_lock(&smc_ib_devices.lock); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (ibdev == known_dev) + continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; @@ -803,6 +806,14 @@ out: spin_unlock(&smc_ib_devices.lock); } +/* find alternate roce device with same pnet_id and vlan_id */ +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -857,7 +868,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index ea207f8fc6f7..811a65986691 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -19,6 +19,7 @@ struct smc_ib_device; struct smcd_dev; struct smc_init_info; +struct smc_link_group; /** * struct smc_pnettable - SMC PNET table anchor @@ -48,5 +49,7 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); - +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev); #endif -- cgit v1.2.3 From 8574cf4055ab44724ee9a4c30921d3ed853d787c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:12 +0200 Subject: net/smc: allocate index for a new link Add smc_llc_alloc_alt_link() to find a free link index for a new link, depending on the new link group type. And update constants for the maximum number of links to 3 (2 symmetric and 1 dangling asymmetric link). These maximum numbers are the same as used by other implementations of the SMC-R protocol. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 2 +- net/smc/smc_llc.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index da3cddbd1651..eb27f2eb7c8c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -128,7 +128,7 @@ struct smc_link { /* For now we just allow one parallel link per link group. 
The SMC protocol * allows more (up to 8). */ -#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_FIRST_CONTACT 1 /* first contact to a peer */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e478a4c11877..3a25b6ebe3a8 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -541,6 +541,30 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) +{ + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} + static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { -- cgit v1.2.3
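The detail worth noting in smc_llc_alloc_alt_link() above is the scan direction: a symmetric link takes the lowest free array index, while an asymmetric link, expected to be short-lived, takes the highest free index, so the two kinds do not compete for the same slots. A standalone sketch of that policy follows; the array size and state values are invented for the example (the kernel checks the link group type and SMC_LNK_UNUSED instead).

#include <stdio.h>

#define MAX_LINKS 3
enum { UNUSED, ACTIVE };

/* pick a free slot: asymmetric links scan from the top, others from the bottom */
static int alloc_link_idx(const int state[MAX_LINKS], int asymmetric)
{
	int i;

	if (asymmetric) {
		for (i = MAX_LINKS - 1; i >= 0; i--)
			if (state[i] == UNUSED)
				return i;
	} else {
		for (i = 0; i < MAX_LINKS; i++)
			if (state[i] == UNUSED)
				return i;
	}
	return -1; /* no free slot */
}

int main(void)
{
	int state[MAX_LINKS] = { ACTIVE, UNUSED, UNUSED };

	printf("symmetric slot: %d\n", alloc_link_idx(state, 0));  /* 1 */
	printf("asymmetric slot: %d\n", alloc_link_idx(state, 1)); /* 2 */
	return 0;
}

From b45e7f98ab7c2d7035d92100ee011584693eccce Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 1 May 2020 12:48:13 +0200 Subject: net/smc: llc_add_link_work to handle ADD_LINK LLC requests Introduce a work that is scheduled when a new ADD_LINK LLC request is received. The work will call either the SMC client or SMC server ADD_LINK processing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 1 + net/smc/smc_llc.c | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index eb27f2eb7c8c..555ada9d2423 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -253,6 +253,7 @@ struct smc_link_group { /* protects llc_event_q */ struct mutex llc_conf_mutex; /* protects lgr reconfig.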
*/ + struct work_struct llc_add_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_waiter; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 3a25b6ebe3a8..50f59746bdf9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -565,6 +565,24 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, return -EMLINK; } +/* worker to process an add link message */ +static void smc_llc_add_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_add_link_work); + + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + /* tbd: call smc_llc_process_cli_add_link(lgr); */ + /* tbd: call smc_llc_process_srv_add_link(lgr); */ +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + static void smc_llc_rx_delete_link(struct smc_link *link, struct smc_llc_msg_del_link *llc) { @@ -685,11 +703,11 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) wake_up_interruptible(&lgr->llc_waiter); } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { - /* tbd: schedule_work(&lgr->llc_add_link_work); */ + schedule_work(&lgr->llc_add_link_work); } } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { /* as smc server, handle client suggestion */ - /* tbd: schedule_work(&lgr->llc_add_link_work); */ + schedule_work(&lgr->llc_add_link_work); } return; case SMC_LLC_CONFIRM_LINK: @@ -868,6 +886,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) struct net *net = sock_net(smc->clcsock->sk); INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); @@ -882,6 +901,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) smc_llc_event_flush(lgr); wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); + cancel_work_sync(&lgr->llc_add_link_work); if (lgr->delayed_event) { kfree(lgr->delayed_event); lgr->delayed_event = NULL; -- cgit v1.2.3 From 40b94224c339e44f689e713875c6c27c9c1270a7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 2 May 2020 16:41:06 -0700 Subject: smc: Remove unused function. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/smc/smc_llc.c:544:12: warning: ‘smc_llc_alloc_alt_link’ defined but not used [-Wunused-function] static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, ^~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 50f59746bdf9..4e3db4d4b783 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -541,30 +541,6 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ -static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, - enum smc_lgr_type lgr_new_t) -{ - int i; - - if (lgr->type == SMC_LGR_SYMMETRIC || - (lgr->type != SMC_LGR_SINGLE && - (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || - lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) - return -EMLINK; - - if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || - lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { - for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - return i; - } else { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) - if (lgr->lnk[i].state == SMC_LNK_UNUSED) - return i; - } - return -EMLINK; -} - /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { -- cgit v1.2.3
From ee1bd483cc062d5050f9537064651dd2e06baee7 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 2 May 2020 18:34:42 +0300 Subject: inet_diag: bc: read cgroup id only for full sockets Fix bug introduced by commit b1f3e43dbfac ("inet_diag: add support for cgroup filter"). Signed-off-by: Dmitry Yakunin Reported-by: syzbot+ee80f840d9bf6893223b@syzkaller.appspotmail.com Reported-by: syzbot+13bef047dbfffa5cd1af@syzkaller.appspotmail.com Fixes: b1f3e43dbfac ("inet_diag: add support for cgroup filter") Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0034092358c3..125f4f8a36b4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -746,7 +746,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA - entry.cgroup_id = cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)); + entry.cgroup_id = sk_fullsock(sk) ? + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); -- cgit v1.2.3
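The inet_diag fix above guards against request and time-wait sockets: those are "mini" sockets that share only a common header with a full struct sock, so sk_cgrp_data does not exist in them and must not be dereferenced. A standalone analogue of the sk_fullsock() guard, with all types invented for the illustration (the kernel derives fullness from the socket state rather than a custom flag):

#include <stdio.h>

struct sock_common { int state; };  /* header shared by all socket flavors */
struct full_sock { struct sock_common c; unsigned long cgroup_id; };

#define TW_STATE 1

static int sk_is_full(const struct sock_common *c)
{
	return c->state != TW_STATE; /* mini sockets carry only the header */
}

static unsigned long diag_cgroup_id(const struct sock_common *c)
{
	/* only downcast when the object really is a full socket */
	return sk_is_full(c) ? ((const struct full_sock *)c)->cgroup_id : 0;
}

int main(void)
{
	struct full_sock full = { .c = { .state = 0 }, .cgroup_id = 42 };
	struct sock_common tw = { .state = TW_STATE };

	printf("full: %lu, tw: %lu\n", diag_cgroup_id(&full.c), diag_cgroup_id(&tw));
	return 0;
}

From dde0a648fc00e2156a3358600c5fbfb3f53256ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:18 -0700 Subject: net_sched: sch_fq: avoid touching f->next from fq_gc() A significant number of CPU cycles is spent in fq_gc(). When fq_gc() does its lookup in the rb-tree, it needs the following fields from struct fq_flow : f->sk (lookup key in the rb-tree) f->fq_node (anchor in the rb-tree) f->next (used to determine if the flow is detached) f->age (used to determine if the flow is candidate for gc) This unfortunately spans two cache lines (assuming 64-byte cache lines). We can avoid using f->next if we use the low order bit of f->{age|tail}. This low order bit is 0 if f->tail points to an sk_buff. We set the low order bit to 1 if the union contains a jiffies value. Combined with the following patch, this makes sure we only need to bring into cpu caches one cache line per flow. Signed-off-by: Eric Dumazet Signed-off-by: David S.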
Miller --- net/sched/sch_fq.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4c060134c736..bc9ca1ba507b 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -70,14 +70,14 @@ struct fq_flow { struct sk_buff *head; /* list of skbs for this flow : first skb */ union { struct sk_buff *tail; /* last skb in the list */ - unsigned long age; /* jiffies when flow was emptied, for gc */ + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; int qlen; /* number of packets in flow queue */ int credit; u32 socket_hash; /* sk_hash */ - struct fq_flow *next; /* next pointer in RR lists, or &detached */ + struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; @@ -126,20 +126,25 @@ struct fq_sched_data { struct qdisc_watchdog watchdog; }; -/* special value to mark a detached flow (not on old/new list) */ -static struct fq_flow detached, throttled; - +/* + * f->tail and f->age share the same location. + * We can use the low order bit to differentiate if this location points + * to a sk_buff or contains a jiffies value, if we force this value to be odd. + * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2 + */ static void fq_flow_set_detached(struct fq_flow *f) { - f->next = &detached; - f->age = jiffies; + f->age = jiffies | 1UL; } static bool fq_flow_is_detached(const struct fq_flow *f) { - return f->next == &detached; + return !!(f->age & 1UL); } +/* special value to mark a throttled flow (not on old/new list) */ +static struct fq_flow throttled; + static bool fq_flow_is_throttled(const struct fq_flow *f) { return f->next == &throttled; -- cgit v1.2.3
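The pointer-tagging trick above works because struct sk_buff is at least 2-byte aligned, so bit 0 of a genuine f->tail pointer is always zero; storing jiffies | 1UL in the same union word therefore marks a detached flow without ever loading the second cache line for f->next. A standalone sketch of the tagging, with struct and field names invented for the illustration:

#include <stdio.h>
#include <time.h>

struct pkt { int len; }; /* stands in for sk_buff; alignment >= 2 */

union tail_or_age {
	struct pkt *tail;   /* a valid pointer: bit 0 is 0 */
	unsigned long age;  /* timestamp | 1UL: bit 0 is 1 */
};

static void set_detached(union tail_or_age *u, unsigned long now)
{
	u->age = now | 1UL; /* force the value odd to distinguish it */
}

static int is_detached(const union tail_or_age *u)
{
	return u->age & 1UL;
}

int main(void)
{
	union tail_or_age u;
	struct pkt p = { 100 };

	u.tail = &p;
	printf("attached: %d\n", is_detached(&u)); /* 0 */
	set_detached(&u, (unsigned long)time(NULL));
	printf("detached: %d\n", is_detached(&u)); /* 1 */
	return 0;
}

From 7ba0537c2b534149be288f851900b4cf5aacde48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:19 -0700 Subject: net_sched: sch_fq: change fq_flow size/layout sizeof(struct fq_flow) is 112 bytes on 64bit arches. This means that half of them use two cache lines, and the other half use three cache lines. This patch adds cache line alignment, and makes sure that only the first cache line is touched by fq_enqueue(), which is more expensive than fq_dequeue() in general. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index bc9ca1ba507b..ced1f987d7e4 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -66,6 +66,7 @@ static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) * in linear list (head,tail), otherwise are placed in a rbtree (t_root).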
*/ struct fq_flow { +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ struct rb_root t_root; struct sk_buff *head; /* list of skbs for this flow : first skb */ union { @@ -74,14 +75,18 @@ struct fq_flow { }; struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; + u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ + +/* Second cache line, used in fq_dequeue() */ int credit; - u32 socket_hash; /* sk_hash */ + /* 32bit hole on 64bit arches */ + struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; -}; +} ____cacheline_aligned_in_smp; struct fq_flow_head { struct fq_flow *first; -- cgit v1.2.3 From 82a0aa53b520edf50e08bad347d87d898de414eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:20 -0700 Subject: net_sched: sch_fq: use bulk freeing in fq_gc() fq_gc() already builds a small array of pointers, so using kmem_cache_free_bulk() needs very little change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ced1f987d7e4..53ec47ff8469 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -214,9 +214,10 @@ static void fq_gc(struct fq_sched_data *q, struct rb_root *root, struct sock *sk) { - struct fq_flow *f, *tofree[FQ_GC_MAX]; struct rb_node **p, *parent; - int fcnt = 0; + void *tofree[FQ_GC_MAX]; + struct fq_flow *f; + int i, fcnt = 0; p = &root->rb_node; parent = NULL; @@ -239,15 +240,18 @@ static void fq_gc(struct fq_sched_data *q, p = &parent->rb_left; } + if (!fcnt) + return; + + for (i = fcnt; i > 0; ) { + f = tofree[--i]; + rb_erase(&f->fq_node, root); + } q->flows -= fcnt; q->inactive_flows -= fcnt; q->stat_gc_flows += fcnt; - while (fcnt) { - struct fq_flow *f = tofree[--fcnt]; - rb_erase(&f->fq_node, root); - kmem_cache_free(fq_flow_cachep, f); - } + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) -- cgit v1.2.3 From c288b0ca86a0a49ad450cf2d76a9a70c2ca9e43f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:21 -0700 Subject: net_sched: sch_fq: do not call fq_peek() twice per packet This refactors the code to not call fq_peek() from fq_dequeue_head() since the caller can provide the skb. Also rename fq_dequeue_head() to fq_dequeue_skb() because 'head' is a bit vague, given the skb could come from t_root rb-tree. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 53ec47ff8469..567df8fcaf70 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -384,19 +384,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, } } -/* remove one skb from head of flow queue */ -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +/* Remove one skb from flow queue. + * This skb must be the return value of prior fq_peek(). 
+ */ +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, + struct sk_buff *skb) { - struct sk_buff *skb = fq_peek(flow); - - if (skb) { - fq_erase_head(sch, flow, skb); - skb_mark_not_on_list(skb); - flow->qlen--; - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - } - return skb; + fq_erase_head(sch, flow, skb); + skb_mark_not_on_list(skb); + flow->qlen--; + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -508,9 +506,11 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) if (!sch->q.qlen) return NULL; - skb = fq_dequeue_head(sch, &q->internal); - if (skb) + skb = fq_peek(&q->internal); + if (unlikely(skb)) { + fq_dequeue_skb(sch, &q->internal, skb); goto out; + } now = ktime_get_ns(); fq_check_throttled(q, now); @@ -550,10 +550,8 @@ begin: INET_ECN_set_ce(skb); q->stat_ce_mark++; } - } - - skb = fq_dequeue_head(sch, f); - if (!skb) { + fq_dequeue_skb(sch, f, skb); + } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ if ((head == &q->new_flows) && q->old_flows.first) { -- cgit v1.2.3 From 348e289b0f23451d1c47b940e759f0f3a0c5756e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 19:54:22 -0700 Subject: net_sched: sch_fq: perform a prefetch() earlier The prefetch() done in fq_dequeue() can be done a bit earlier after the refactoring of the code done in the prior patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 567df8fcaf70..4f0104243cc2 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -546,6 +546,7 @@ begin: fq_flow_set_throttled(q, f); goto begin; } + prefetch(&skb->end); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; @@ -562,7 +563,6 @@ begin: } goto begin; } - prefetch(&skb->end); plen = qdisc_pkt_len(skb); f->credit -= plen; -- cgit v1.2.3 From 336ba09f2ef71b82f07c1200be0ddf4eb923d69f Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:40 +0200 Subject: net/smc: first part of add link processing as SMC client First set of functions to process an ADD_LINK LLC request as an SMC client. Find an alternate IB device, determine the new link group type and get the index for the new link. Then ready the link, map the buffers and send an ADD_LINK LLC response. If any error occurs, send a reject LLC message and terminate the processing. Add smc_llc_alloc_alt_link() to find a free link index for a new link, depending on the new link group type. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 4 +- net/smc/smc_core.h | 2 + net/smc/smc_llc.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 60c708f6de51..2f8faa9c9e8e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -273,8 +273,8 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) return link_id; } -static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, - u8 link_idx, struct smc_init_info *ini) +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) { u8 rndvec[3]; int rc; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 555ada9d2423..4e00819e2db7 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -374,6 +374,8 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4e3db4d4b783..8716d8739329 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -17,6 +17,7 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_llc.h" +#include "smc_pnet.h" #define SMC_LLC_DATA_LEN 40 @@ -541,6 +542,112 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) /********************************* receive ***********************************/ +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) +{ + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} + +/* prepare and send an add link reject response */ +static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + return smc_llc_send_message(qentry->link, &qentry->msg); +} + +static void smc_llc_save_add_link_info(struct smc_link *link, + struct smc_llc_msg_add_link *add_llc) +{ + link->peer_qpn = ntoh24(add_llc->sender_qp_num); + memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); + link->peer_psn = ntoh24(add_llc->initial_psn); + link->peer_mtu = add_llc->qp_mtu; +} + +/* as an SMC client, process an add link request */ +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) +{ + struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) 
&& + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) { + if (!ini.ib_dev) + goto out_reject; + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + goto out_reject; + lnk_new = &lgr->lnk[lnk_idx]; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini); + if (rc) + goto out_reject; + smc_llc_save_add_link_info(lnk_new, llc); + lnk_new->link_id = llc->link_num; + + rc = smc_ib_ready_link(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smcr_buf_map_lgr(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smc_llc_send_add_link(link, + lnk_new->smcibdev->mac[ini.ib_port - 1], + lnk_new->gid, lnk_new, SMC_LLC_RESP); + if (rc) + goto out_clear_lnk; + /* tbd: rc = smc_llc_cli_rkey_exchange(link, lnk_new); */ + if (rc) { + rc = 0; + goto out_clear_lnk; + } + /* tbd: rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); */ + if (!rc) + goto out; +out_clear_lnk: + smcr_link_clear(lnk_new); +out_reject: + smc_llc_cli_add_link_reject(qentry); +out: + kfree(qentry); + return rc; +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { -- cgit v1.2.3 From 87f88cda2128a72d79d4cc700729488af1081a06 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:41 +0200 Subject: net/smc: rkey processing for a new link as SMC client Part of the SMC client new link establishment process is the exchange of rkeys for all used buffers. Add new LLC message type ADD_LINK_CONTINUE which is used to exchange rkeys of all current RMB buffers. Add functions to iterate over all used RMB buffers of the link group, and implement the ADD_LINK_CONTINUE processing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 1 + 2 files changed, 157 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 8716d8739329..a06b618f172e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -70,6 +70,23 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 reserved[8]; }; +struct smc_llc_msg_add_link_cont_rt { + __be32 rmb_key; + __be32 rmb_key_new; + __be64 rmb_vaddr_new; +}; + +#define SMC_LLC_RKEYS_PER_CONT_MSG 2 + +struct smc_llc_msg_add_link_cont { /* type 0x03 */ + struct smc_llc_hdr hd; + u8 link_num; + u8 num_rkeys; + u8 reserved2[2]; + struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG]; + u8 reserved[4]; +} __packed; /* format defined in RFC7609 */ + #define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 #define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 @@ -121,6 +138,7 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; @@ -566,6 +584,137 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, return -EMLINK; } +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +/* send one add_link_continue msg */ +static int smc_llc_add_link_cont(struct smc_link *link, + struct smc_link *link_new, u8 *num_rkeys_todo, + int *buf_lst, struct smc_buf_desc **buf_pos) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + int prim_lnk_idx, lnk_idx, i, rc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_buf_desc *rmb; + u8 n; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; + memset(addc_llc, 0, sizeof(*addc_llc)); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + addc_llc->link_num = link_new->link_id; + addc_llc->num_rkeys = *num_rkeys_todo; + n = *num_rkeys_todo; + for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + if (!*buf_pos) { + addc_llc->num_rkeys = addc_llc->num_rkeys - + *num_rkeys_todo; + *num_rkeys_todo = 0; + break; + } + rmb = *buf_pos; + + addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + + (*num_rkeys_todo)--; + *buf_pos = smc_llc_get_next_rmb(lgr, 
buf_lst, *buf_pos); + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + } + addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); + if (lgr->role == SMC_CLNT) + addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; + return smc_wr_tx_send(link, pend); +} + +static int smc_llc_cli_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + break; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + if (rc) + break; + } while (num_rkeys_send || num_rkeys_recv); + + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + /* prepare and send an add link reject response */ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) { @@ -631,7 +780,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) lnk_new->gid, lnk_new, SMC_LLC_RESP); if (rc) goto out_clear_lnk; - /* tbd: rc = smc_llc_cli_rkey_exchange(link, lnk_new); */ + rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { rc = 0; goto out_clear_lnk; @@ -794,6 +943,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } return; case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* a flow is waiting for this message */ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); @@ -873,6 +1023,7 @@ static void smc_llc_rx_response(struct smc_link *link, break; case SMC_LLC_ADD_LINK: case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: case SMC_LLC_CONFIRM_RKEY: case SMC_LLC_DELETE_RKEY: /* assign responses to the local flow, we requested them */ @@ -1092,6 +1243,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_ADD_LINK }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_CONT + }, { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4ed4486e5082..97a4f02f5a93 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -28,6 +28,7 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, -- cgit v1.2.3
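The rkey exchange just added is a small windowed protocol: each ADD_LINK_CONTINUE message carries at most SMC_LLC_RKEYS_PER_CONT_MSG (two) rkey/vaddr translations, and both sides loop until their send and receive counters reach zero. A standalone sketch of the sender-side chunking, with the message layout and names invented for the illustration:

#include <stdio.h>

#define RKEYS_PER_MSG 2

struct cont_msg { int num_rkeys; unsigned int rkey[RKEYS_PER_MSG]; };

/* emit the buffers' rkeys in chunks of RKEYS_PER_MSG, as the LLC flow does */
static void send_rkeys(const unsigned int *rkeys, int todo)
{
	while (todo > 0) {
		struct cont_msg msg = { 0 };
		int i;

		for (i = 0; i < RKEYS_PER_MSG && todo > 0; i++, todo--)
			msg.rkey[msg.num_rkeys++] = *rkeys++;
		printf("msg with %d rkeys (first 0x%x)\n",
		       msg.num_rkeys, msg.rkey[0]);
	}
}

int main(void)
{
	unsigned int rkeys[] = { 0x11, 0x22, 0x33, 0x44, 0x55 };

	send_rkeys(rkeys, 5); /* 3 messages: 2 + 2 + 1 rkeys */
	return 0;
}

From b1570a87f57e94e9f74b8942840f9bd16bd1aba5 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:42 +0200 Subject: net/smc: final part of add link processing as SMC client This patch finalizes the ADD_LINK processing of new links.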
Receive the CONFIRM_LINK request from peer, complete the link initialization, register all used buffers with the IB device and finally send the CONFIRM_LINK response, which completes the ADD_LINK processing. And activate smc_llc_cli_add_link() in af_smc.c. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 +- net/smc/smc_llc.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- net/smc/smc_llc.h | 1 + 3 files changed, 72 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6663a63be9e4..1afb6e4275f2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -427,7 +427,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) return rc; } smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - /* tbd: call smc_llc_cli_add_link(link, qentry); */ + smc_llc_cli_add_link(link, qentry); return 0; } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a06b618f172e..d56ca60597d4 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -381,7 +381,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */ + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -724,6 +724,61 @@ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) return smc_llc_send_message(qentry->link, &qentry->msg); } +static int smc_llc_cli_conf_link(struct smc_link *link, + struct smc_init_info *ini, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + /* receive CONFIRM LINK request over RoCE fabric */ + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry) { + rc = smc_llc_send_delete_link(link, link_new->link_id, + SMC_LLC_REQ, false, + SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + /* received DELETE_LINK instead */ + del_llc = &qentry->msg.delete_link; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_modify_qp_rts(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_wr_remember_qp_attr(link_new); + + rc = smcr_buf_reg_lgr(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + lgr->type = lgr_new_t; + return 0; +} + static void smc_llc_save_add_link_info(struct smc_link *link, struct smc_llc_msg_add_link *add_llc) { @@ -785,7 +840,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = 0; goto out_clear_lnk; } - /* tbd: rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); */ + rc = 
smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); if (!rc) goto out; out_clear_lnk: @@ -797,6 +852,17 @@ out: return rc; } +static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_cli_add_link(qentry->link, qentry); + mutex_unlock(&lgr->llc_conf_mutex); +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { @@ -809,7 +875,8 @@ static void smc_llc_add_link_work(struct work_struct *work) goto out; } - /* tbd: call smc_llc_process_cli_add_link(lgr); */ + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_add_link(lgr); /* tbd: call smc_llc_process_srv_add_link(lgr); */ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 97a4f02f5a93..7c314bbef8c8 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -88,6 +88,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, int time_out, u8 exp_msg); struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From 2d2209f2018943d4152a21eff5b76f1952e0b435 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:43 +0200 Subject: net/smc: first part of add link processing as SMC server First set of functions to process an ADD_LINK LLC request as an SMC server. Find an alternate IB device, determine the new link group type and get the index for the new link. Then initialize the link and send the ADD_LINK LLC message to the peer. Save the contents of the response, ready the link, map all used buffers and register the buffers with the IB device. If any error occurs, stop the processing and clear the link. And call smc_llc_srv_add_link() in af_smc.c to start second link establishment after the initial link of a link group was created. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 2 +- net/smc/smc_llc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 1 + 3 files changed, 92 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1afb6e4275f2..c67272007f41 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1067,7 +1067,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); /* initial contact - try to establish second link */ - /* tbd: call smc_llc_srv_add_link(link); */ + smc_llc_srv_add_link(link); return 0; } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index d56ca60597d4..e2f254e21759 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,94 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +int smc_llc_srv_add_link(struct smc_link *link) +{ + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_add_link *add_llc; + struct smc_llc_qentry *qentry = NULL; + struct smc_link *link_new; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + /* ignore client add link recommendation, start new flow */ + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + return 0; + + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini); + if (rc) + return rc; + link_new = &lgr->lnk[lnk_idx]; + rc = smc_llc_send_add_link(link, + link_new->smcibdev->mac[ini.ib_port - 1], + link_new->gid, link_new, SMC_LLC_REQ); + if (rc) + goto out_err; + /* receive ADD LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); + if (!qentry) { + rc = -ETIMEDOUT; + goto out_err; + } + add_llc = &qentry->msg.add_link; + if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) { + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = -ENOLINK; + goto out_err; + } + if (lgr->type == SMC_LGR_SINGLE && + (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) { + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + smc_llc_save_add_link_info(link_new, add_llc); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_ready_link(link_new); + if (rc) + goto out_err; + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + rc = smcr_buf_reg_lgr(link_new); + if (rc) + goto out_err; + /* tbd: rc = smc_llc_srv_rkey_exchange(link, link_new); */ + if (rc) + goto out_err; + /* tbd: rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); */ + if (rc) + goto out_err; + return 0; +out_err: + smcr_link_clear(link_new); + return rc; +} + +static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) +{ + struct smc_link *link = lgr->llc_flow_lcl.qentry->link; + int rc; + + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + rc = smc_llc_srv_add_link(link); + if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { + /* delete any asymmetric link */ + /* tbd: smc_llc_delete_asym_link(lgr); */ + } + mutex_unlock(&lgr->llc_conf_mutex); +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { @@ -877,7 +965,8 @@ static void smc_llc_add_link_work(struct work_struct *work) if (lgr->role == SMC_CLNT) 
smc_llc_process_cli_add_link(lgr); - /* tbd: call smc_llc_process_srv_add_link(lgr); */ + else + smc_llc_process_srv_add_link(lgr); out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 7c314bbef8c8..1a7748d0541f 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -89,6 +89,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); +int smc_llc_srv_add_link(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From 57b499242cb888a32815f8663b60338bcb0b5747 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:44 +0200 Subject: net/smc: rkey processing for a new link as SMC server Part of SMC server new link establishment is the exchange of rkeys for used buffers. Loop over all used RMB buffers and send ADD_LINK_CONTINUE LLC messages to the peer. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e2f254e21759..de73432bd72f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,47 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +static int smc_llc_srv_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry = NULL; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + goto out; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } while (num_rkeys_send || num_rkeys_recv); +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + int smc_llc_srv_add_link(struct smc_link *link) { enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; @@ -923,7 +964,7 @@ int smc_llc_srv_add_link(struct smc_link *link) rc = smcr_buf_reg_lgr(link_new); if (rc) goto out_err; - /* tbd: rc = smc_llc_srv_rkey_exchange(link, link_new); */ + rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) goto out_err; /* tbd: rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); */ -- cgit v1.2.3 From 1551c95b61242b1a20565bae8d711f35a601c4f3 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:45 +0200 Subject: net/smc: final part of add link processing as SMC server This patch finalizes the ADD_LINK processing of new links. Send the CONFIRM_LINK request to the peer, receive the response and set link state to ACTIVE. 
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index de73432bd72f..1fefee55e293 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -904,6 +904,33 @@ out: return rc; } +static int smc_llc_srv_conf_link(struct smc_link *link, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc; + + /* send CONFIRM LINK request over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ); + if (rc) + return -ENOLINK; + /* receive CONFIRM LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { + /* send DELETE LINK */ + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + lgr->type = lgr_new_t; + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return 0; +} + int smc_llc_srv_add_link(struct smc_link *link) { enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; @@ -967,7 +994,7 @@ int smc_llc_srv_add_link(struct smc_link *link) rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) goto out_err; - /* tbd: rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); */ + rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); if (rc) goto out_err; return 0; -- cgit v1.2.3
From c9a5d243035161f06175a7c6d487c9860e0f179a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:46 +0200 Subject: net/smc: delete an asymmetric link as SMC server When a link group moves from asymmetric to symmetric state, the dangling asymmetric link can be deleted. Add smc_llc_find_asym_link() to find the respective link and add smc_llc_delete_asym_link() to delete it. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_wr.c | 2 +- net/smc/smc_wr.h | 1 + 3 files changed, 82 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1fefee55e293..9d102c912be9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,85 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +/* find the asymmetric link when 3 links are established */ +static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) +{ + int asym_idx = -ENOENT; + int i, j, k; + bool found; + + /* determine asymmetric link */ + found = false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + if (!smc_link_usable(&lgr->lnk[i]) || + !smc_link_usable(&lgr->lnk[j])) + continue; + if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid, + SMC_GID_SIZE)) { + found = true; /* asym_lnk is i or j */ + break; + } + } + if (found) + break; + } + if (!found) + goto out; /* no asymmetric link */ + for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) { + if (!smc_link_usable(&lgr->lnk[k])) + continue; + if (k != i && + !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = i; + break; + } + if (k != j && + !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = j; + break; + } + } +out: + return (asym_idx < 0) ?
NULL : &lgr->lnk[asym_idx]; +} + +static void smc_llc_delete_asym_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_new = NULL, *lnk_asym; + struct smc_llc_qentry *qentry; + int rc; + + lnk_asym = smc_llc_find_asym_link(lgr); + if (!lnk_asym) + return; /* no asymmetric link */ + if (!smc_link_downing(&lnk_asym->state)) + return; + /* tbd: lnk_new = smc_switch_conns(lgr, lnk_asym, false); */ + smc_wr_tx_wait_no_pending_sends(lnk_asym); + if (!lnk_new) + goto out_free; + /* change flow type from ADD_LINK into DEL_LINK */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK; + rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, + true, SMC_LLC_DEL_NO_ASYM_NEEDED); + if (rc) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); +out_free: + smcr_link_clear(lnk_asym); +} + static int smc_llc_srv_rkey_exchange(struct smc_link *link, struct smc_link *link_new) { @@ -1014,7 +1093,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) rc = smc_llc_srv_add_link(link); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ - /* tbd: smc_llc_delete_asym_link(lgr); */ + smc_llc_delete_asym_link(lgr); } mutex_unlock(&lgr->llc_conf_mutex); } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 031e6c9561b1..3fd27bea4f7a 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -61,7 +61,7 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link) } /* wait till all pending tx work requests on the given link are completed */ -static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) { if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), SMC_WR_TX_WAIT_PENDING_TIME)) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 3ac99c898418..f7eaeb3391f3 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -106,6 +106,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data); +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); -- cgit v1.2.3 From 9ec6bf19ec8bb19f4211f6a2bf62c079d46b54ea Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:47 +0200 Subject: net/smc: llc_del_link_work and use the LLC flow for delete link Introduce a work that is scheduled when a new DELETE_LINK LLC request is received. The work will call either the SMC client or SMC server DELETE_LINK processing. And use the LLC flow framework to process incoming DELETE_LINK LLC messages, scheduling the llc_del_link_work for those events. With these changes smc_lgr_forget() is only called by one function and can be migrated into smc_lgr_cleanup_early(). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 22 ++++++++-------------- net/smc/smc_core.h | 2 +- net/smc/smc_llc.c | 55 +++++++++++++++++++++++++++++++++++------------------- 3 files changed, 45 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2f8faa9c9e8e..a964304283fa 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -193,12 +193,19 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) void smc_lgr_cleanup_early(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + struct list_head *lgr_list; + spinlock_t *lgr_lock; if (!lgr) return; smc_conn_free(conn); - smc_lgr_forget(lgr); + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + /* do not use this link group for new connections */ + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); smc_lgr_schedule_free_work_fast(lgr); } @@ -653,19 +660,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } -void smc_lgr_forget(struct smc_link_group *lgr) -{ - struct list_head *lgr_list; - spinlock_t *lgr_lock; - - lgr_list = smc_lgr_list_head(lgr, &lgr_lock); - spin_lock_bh(lgr_lock); - /* do not use this link group for new connections */ - if (!list_empty(lgr_list)) - list_del_init(lgr_list); - spin_unlock_bh(lgr_lock); -} - static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) { int i; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4e00819e2db7..7fe53feb9dc4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -254,6 +254,7 @@ struct smc_link_group { struct mutex llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; + struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_waiter; @@ -343,7 +344,6 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9d102c912be9..e4e3910a9624 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1118,22 +1118,18 @@ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -static void smc_llc_rx_delete_link(struct smc_link *link, - struct smc_llc_msg_del_link *llc) +static void smc_llc_delete_link_work(struct work_struct *work) { - struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_del_link_work); - smc_lgr_forget(lgr); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_send_delete_link(link, 0, SMC_LLC_REQ, true, - SMC_LLC_DEL_PROG_INIT_TERM); - } else { - /* server requests to delete this link, send response */ - smc_llc_send_delete_link(link, 0, SMC_LLC_RESP, true, - SMC_LLC_DEL_PROG_INIT_TERM); + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; } - smcr_link_down_cond(link); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } /* process a confirm_rkey request from peer, remote flow */ @@ -1255,8 +1251,30 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) } break; case SMC_LLC_DELETE_LINK: - smc_llc_rx_delete_link(link, &llc->delete_link); - break; + if (lgr->role == SMC_CLNT) { + /* server requests to delete this link, 
send response */ + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } else { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } + return; case SMC_LLC_CONFIRM_RKEY: /* new request from remote, assign to remote flow */ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { @@ -1325,6 +1343,7 @@ static void smc_llc_rx_response(struct smc_link *link, complete(&link->llc_testlink_resp); break; case SMC_LLC_ADD_LINK: + case SMC_LLC_DELETE_LINK: case SMC_LLC_CONFIRM_LINK: case SMC_LLC_ADD_LINK_CONT: case SMC_LLC_CONFIRM_RKEY: @@ -1333,10 +1352,6 @@ static void smc_llc_rx_response(struct smc_link *link, smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); wake_up_interruptible(&link->lgr->llc_waiter); return; - case SMC_LLC_DELETE_LINK: - if (link->lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(link->lgr); - break; case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; @@ -1424,6 +1439,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); + INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work); INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); @@ -1439,6 +1455,7 @@ void smc_llc_lgr_clear(struct smc_link_group *lgr) wake_up_interruptible_all(&lgr->llc_waiter); cancel_work_sync(&lgr->llc_event_work); cancel_work_sync(&lgr->llc_add_link_work); + cancel_work_sync(&lgr->llc_del_link_work); if (lgr->delayed_event) { kfree(lgr->delayed_event); lgr->delayed_event = NULL; -- cgit v1.2.3 From 9c4168789cc635e1f0d265157b7617259d56bfee Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:48 +0200 Subject: net/smc: delete link processing as SMC client Add smc_llc_process_cli_delete_link() to process a DELETE_LINK request as SMC client. When the request is to delete ALL links then terminate the whole link group. If not, find the link to delete by its link_id, send the DELETE_LINK response LLC message and then clear the deleted link. Finally determine and update the link group state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e4e3910a9624..cd57b4fb1842 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -863,6 +863,18 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +static int smc_llc_active_link_count(struct smc_link_group *lgr) +{ + int i, link_count = 0; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + link_count++; + } + return link_count; +} + /* find the asymmetric link when 3 links are established */ static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) { @@ -1118,6 +1130,63 @@ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } +static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry; + int active_links; + int lnk_idx; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + smc_lgr_terminate_sched(lgr); + goto out; + } + mutex_lock(&lgr->llc_conf_mutex); + /* delete single link */ + for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { + if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) + continue; + lnk_del = &lgr->lnk[lnk_idx]; + break; + } + del_llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (!lnk_del) { + /* link was not found */ + del_llc->reason = htonl(SMC_LLC_DEL_NOLNK); + smc_llc_send_message(lnk, &qentry->msg); + goto out_unlock; + } + lnk_asym = smc_llc_find_asym_link(lgr); + + del_llc->reason = 0; + smc_llc_send_message(lnk, &qentry->msg); /* response */ + + if (smc_link_downing(&lnk_del->state)) { + /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + smcr_link_clear(lnk_del); + + active_links = smc_llc_active_link_count(lgr); + if (lnk_del == lnk_asym) { + /* expected deletion of asym link, don't change lgr state */ + } else if (active_links == 1) { + lgr->type = SMC_LGR_SINGLE; + } else if (!active_links) { + lgr->type = SMC_LGR_NONE; + smc_lgr_terminate_sched(lgr); + } +out_unlock: + mutex_unlock(&lgr->llc_conf_mutex); +out: + kfree(qentry); +} + static void smc_llc_delete_link_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, @@ -1128,6 +1197,9 @@ static void smc_llc_delete_link_work(struct work_struct *work) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); goto out; } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_delete_link(lgr); out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -- cgit v1.2.3 From 08ae27ddfb6514a8316b17256cd4262bb6931c1f Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:49 +0200 Subject: net/smc: delete link processing as SMC server Add smc_llc_process_srv_delete_link() to process a DELETE_LINK request as SMC server. When the request is to delete ALL links then terminate the whole link group. If not, find the link to delete by its link_id, send the DELETE_LINK request LLC message and wait for the response. No matter if a response was received, clear the deleted link and update the link group state. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index cd57b4fb1842..ac065f6d60dc 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1187,6 +1187,76 @@ out: kfree(qentry); } +static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) +{ + struct smc_llc_msg_del_link *del_llc; + struct smc_link *lnk, *lnk_del; + struct smc_llc_qentry *qentry; + int active_links; + int i; + + mutex_lock(&lgr->llc_conf_mutex); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + /* delete entire lgr */ + smc_lgr_terminate_sched(lgr); + goto out; + } + /* delete single link */ + lnk_del = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].link_id == del_llc->link_num) { + lnk_del = &lgr->lnk[i]; + break; + } + } + if (!lnk_del) + goto out; /* asymmetric link already deleted */ + + if (smc_link_downing(&lnk_del->state)) { + /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + if (!list_empty(&lgr->list)) { + /* qentry is either a request from peer (send it back to + * initiate the DELETE_LINK processing), or a locally + * enqueued DELETE_LINK request (forward it) + */ + if (!smc_llc_send_message(lnk, &qentry->msg)) { + struct smc_llc_msg_del_link *del_llc_resp; + struct smc_llc_qentry *qentry2; + + qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry2) { + } else { + del_llc_resp = &qentry2->msg.delete_link; + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } + } + } + smcr_link_clear(lnk_del); + + active_links = smc_llc_active_link_count(lgr); + if (active_links == 1) { + lgr->type = SMC_LGR_SINGLE; + } else if (!active_links) { + lgr->type = SMC_LGR_NONE; + smc_lgr_terminate_sched(lgr); + } + + if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { + /* trigger setup of asymm alt link */ + /* tbd: call smc_llc_srv_add_link_local(lnk); */ + } +out: + mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); +} + static void smc_llc_delete_link_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(work, struct smc_link_group, @@ -1200,6 +1270,8 @@ static void smc_llc_delete_link_work(struct work_struct *work) if (lgr->role == SMC_CLNT) smc_llc_process_cli_delete_link(lgr); + else + smc_llc_process_srv_delete_link(lgr); out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -- cgit v1.2.3 From 4dadd151b26589fd0520feb97c93ee981b393a99 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sun, 3 May 2020 14:38:50 +0200 Subject: net/smc: enqueue local LLC messages As SMC server, when a second link was deleted, trigger the setup of an asymmetric link. Do this by enqueueing a local ADD_LINK message which is processed by the LLC layer as if it were received from peer. Do the same when a new IB port became active and a new link could be created. smc_llc_srv_add_link_local() enqueues a local ADD_LINK message. And smc_llc_srv_delete_link_local() is used the same way to enqueue a local DELETE_LINK message. This is used when an IB port is no longer active. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 3 ++- net/smc/smc_llc.c | 30 +++++++++++++++++++++++++++++- net/smc/smc_llc.h | 2 ++ 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a964304283fa..32a6cadc5c1f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -883,7 +883,7 @@ static void smcr_link_up(struct smc_link_group *lgr, link = smc_llc_usable_link(lgr); if (!link) return; - /* tbd: call smc_llc_srv_add_link_local(link); */ + smc_llc_srv_add_link_local(link); } else { /* invite server to start add link processing */ u8 gid[SMC_GID_SIZE]; @@ -954,6 +954,7 @@ static void smcr_link_down(struct smc_link *lnk) if (lgr->role == SMC_SERV) { /* trigger local delete link processing */ + smc_llc_srv_delete_link_local(to_lnk, del_link_id); } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index ac065f6d60dc..7675ccd6f3c3 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -159,6 +159,8 @@ struct smc_llc_qentry { union smc_llc_msg msg; }; +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc); + struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) { struct smc_llc_qentry *qentry = flow->qentry; @@ -1110,6 +1112,17 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) mutex_unlock(&lgr->llc_conf_mutex); } +/* enqueue a local add_link req to trigger a new add_link flow, only as SERV */ +void smc_llc_srv_add_link_local(struct smc_link *link) +{ + struct smc_llc_msg_add_link add_llc = {0}; + + add_llc.hd.length = sizeof(add_llc); + add_llc.hd.common.type = SMC_LLC_ADD_LINK; + /* no dev and port needed, we as server ignore client data anyway */ + smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); +} + /* worker to process an add link message */ static void smc_llc_add_link_work(struct work_struct *work) { @@ -1130,6 +1143,21 @@ out: smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } +/* enqueue a local del_link msg to trigger a new del_link flow, + * called only for role SMC_SERV + */ +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) +{ + struct smc_llc_msg_del_link del_llc = {0}; + + del_llc.hd.length = sizeof(del_llc); + del_llc.hd.common.type = SMC_LLC_DELETE_LINK; + del_llc.link_num = del_link_id; + del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); + del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc); +} + static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) { struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; @@ -1250,7 +1278,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { /* trigger setup of asymm alt link */ - /* tbd: call smc_llc_srv_add_link_local(lnk); */ + smc_llc_srv_add_link_local(lnk); } out: mutex_unlock(&lgr->llc_conf_mutex); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 1a7748d0541f..c335fc5f363c 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -69,6 +69,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int 
smc_llc_link_init(struct smc_link *link); @@ -90,6 +91,7 @@ struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); int smc_llc_srv_add_link(struct smc_link *link); +void smc_llc_srv_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3 From f0ec4f1d32ad49a23b93156949208dd9348e3590 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:37 +0200 Subject: net/smc: save state of last sent CDC message When a link goes down and all connections of this link need to be switched to another link, the producer cursor and the sequence number of the last successfully sent CDC message must be known. Add the two fields to the SMC connection and update them in the tx completion handler. To allow matching of sequences in error cases, reset the seqno to the old value in smc_cdc_msg_send() when the actual send failed. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 4 ++++ net/smc/smc_cdc.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 1a084afa7372..1e9113771600 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -143,6 +143,9 @@ struct smc_connection { * .prod cf. TCP snd_nxt * .cons cf. TCP sends ack */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ @@ -154,6 +157,7 @@ struct smc_connection { */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f64589d823aa..c5e33296e55c 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -47,6 +47,9 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); + smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor, + conn); + conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); @@ -104,6 +107,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } else { + conn->tx_cdc_seq--; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; } return rc; -- cgit v1.2.3 From c6f02ebeea3a0ff4bddddf0fd82303190ebb3dd1 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:38 +0200 Subject: net/smc: switch connections to alternate link Add smc_switch_conns() to switch all connections from a link that is going down. Find another link to switch the connections to, and switch each connection to the new link. smc_switch_cursor() updates the cursors of a connection to the state of the last successfully sent CDC message. When there is no link to switch to, terminate the link group. Call smc_switch_conns() when a link is going down. And since the link of a connection can now switch, adapt the CDC and TX functions to detect and handle link switches.
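Aside: the CDC and TX changes in the diff below share one pattern: snapshot conn->lnk, acquire a send slot on that snapshot, then re-check under conn->send_lock whether the connection was switched to another link in the meantime; if so, return the slot and retry once. A rough pthread-based user-space sketch of that pattern follows; get_slot() and put_slot() are hypothetical stand-ins for smc_cdc_get_free_slot() and smc_wr_tx_put_slot(), not kernel API.

#include <errno.h>
#include <pthread.h>

struct conn {
	pthread_mutex_t send_lock;
	void *lnk;		/* current link; may be switched concurrently */
};

/* hypothetical stand-ins for the slot acquire/release helpers */
static int get_slot(void *link) { (void)link; return 0; }
static void put_slot(void *link) { (void)link; }

static int send_on_current_link(struct conn *c)
{
	int again = 0;
	void *link;

retry:
	link = c->lnk;			/* snapshot the link outside the lock */
	if (get_slot(link))
		return -EBUSY;
	pthread_mutex_lock(&c->send_lock);
	if (link != c->lnk) {		/* connection switched links meanwhile */
		pthread_mutex_unlock(&c->send_lock);
		put_slot(link);		/* slot belongs to the old link */
		if (again)
			return -ENOLINK;	/* switched twice: give up */
		again = 1;
		goto retry;
	}
	/* ... build and post the message on 'link' while holding the lock ... */
	pthread_mutex_unlock(&c->send_lock);
	return 0;
}

int main(void)
{
	struct conn c = { PTHREAD_MUTEX_INITIALIZER, (void *)1 };

	return send_on_current_link(&c);
}

Retrying only once is a deliberate choice: a second switch while the first retry is in flight indicates the link group is unstable, and the caller is better served by an error than by looping.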
Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 18 +++++++- net/smc/smc_cdc.h | 1 + net/smc/smc_core.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_core.h | 2 + net/smc/smc_llc.c | 6 +-- net/smc/smc_tx.c | 12 ++++- 6 files changed, 162 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index c5e33296e55c..3ca986066f32 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -56,11 +56,11 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, } int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = conn->lnk; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -119,13 +119,27 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; + struct smc_link *link; + bool again = false; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); +again: + link = conn->lnk; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) return rc; spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, try again one time*/ + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + if (again) + return -ENOLINK; + again = true; + goto again; + } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 861dc24c588c..42246b4bdcc9 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -304,6 +304,7 @@ struct smc_cdc_tx_pend { }; int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 32a6cadc5c1f..21bc1ec07e99 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -432,6 +432,135 @@ out: return rc; } +static int smc_write_space(struct smc_connection *conn) +{ + int buffer_len = conn->peer_rmbe_size; + union smc_host_cursor prod; + union smc_host_cursor cons; + int space; + + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + /* determine rx_buf space */ + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); + return space; +} + +static int smc_switch_cursor(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons, fin; + int rc = 0; + int diff; + + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); + /* set prod cursor to old state, enforce tx_rdma_writes() */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { + /* cons cursor advanced more than fin, and prod was set + * fin above, so now prod is smaller than cons. Fix that. 
+ */ + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_sent, diff); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_fin, diff); + + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + smp_mb__after_atomic(); + + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl.prod, diff); + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl_fin, diff); + } + /* recalculate, value is used by tx_rdma_writes() */ + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); + + if (smc->sk.sk_state != SMC_INIT && + smc->sk.sk_state != SMC_CLOSED) { + /* tbd: call rc = smc_cdc_get_slot_and_msg_send(conn); */ + if (!rc) { + schedule_delayed_work(&conn->tx_work, 0); + smc->sk.sk_data_ready(&smc->sk); + } + } + return rc; +} + +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err) +{ + struct smc_link *to_lnk = NULL; + struct smc_connection *conn; + struct smc_sock *smc; + struct rb_node *node; + int i, rc = 0; + + /* link is inactive, wake up tx waiters */ + smc_wr_wakeup_tx_wait(from_lnk); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE || + i == from_lnk->link_idx) + continue; + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && + from_lnk->ibport == lgr->lnk[i].ibport) { + continue; + } + to_lnk = &lgr->lnk[i]; + break; + } + if (!to_lnk) { + smc_lgr_terminate_sched(lgr); + return NULL; + } +again: + read_lock_bh(&lgr->conns_lock); + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { + conn = rb_entry(node, struct smc_connection, alert_node); + if (conn->lnk != from_lnk) + continue; + smc = container_of(conn, struct smc_sock, conn); + /* conn->lnk not yet set in SMC_INIT state */ + if (smc->sk.sk_state == SMC_INIT) + continue; + if (smc->sk.sk_state == SMC_CLOSED || + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || + smc->sk.sk_state == SMC_PEERABORTWAIT || + smc->sk.sk_state == SMC_PROCESSABORT) { + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + spin_unlock_bh(&conn->send_lock); + continue; + } + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + /* avoid race with smcr_tx_sndbuf_nonempty() */ + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + rc = smc_switch_cursor(smc); + spin_unlock_bh(&conn->send_lock); + sock_put(&smc->sk); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } + goto again; + } + read_unlock_bh(&lgr->conns_lock); + return to_lnk; +} + static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, struct smc_link_group *lgr) { @@ -943,8 +1072,7 @@ static void smcr_link_down(struct smc_link *lnk) return; smc_ib_modify_qp_reset(lnk); - to_lnk = NULL; - /* tbd: call to_lnk = smc_switch_conns(lgr, lnk, true); */ + to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ smcr_link_clear(lnk); return; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 7fe53feb9dc4..584f11230c4f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -380,6 +380,8 @@ void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +struct smc_link 
*smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 7675ccd6f3c3..8d2368accbad 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -933,7 +933,7 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) return; /* no asymmetric link */ if (!smc_link_downing(&lnk_asym->state)) return; - /* tbd: lnk_new = smc_switch_conns(lgr, lnk_asym, false); */ + lnk_new = smc_switch_conns(lgr, lnk_asym, false); smc_wr_tx_wait_no_pending_sends(lnk_asym); if (!lnk_new) goto out_free; @@ -1195,7 +1195,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_llc_send_message(lnk, &qentry->msg); /* response */ if (smc_link_downing(&lnk_del->state)) { - /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_switch_conns(lgr, lnk_del, false); smc_wr_tx_wait_no_pending_sends(lnk_del); } smcr_link_clear(lnk_del); @@ -1245,7 +1245,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) goto out; /* asymmetric link already deleted */ if (smc_link_downing(&lnk_del->state)) { - /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ + smc_switch_conns(lgr, lnk_del, false); smc_wr_tx_wait_no_pending_sends(lnk_del); } if (!list_empty(&lgr->list)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 417204572a69..54ba0443847e 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -482,12 +482,13 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -505,10 +506,17 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) } spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, tx_work will restart */ + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + rc = -ENOLINK; + goto out_unlock; + } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(conn->lnk, + smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } -- cgit v1.2.3 From 29bd73dba4f72970895a2459f7190d388f5204f7 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:39 +0200 Subject: net/smc: send failover validation message When a connection is switched to a new link, a link validation message must be sent to the peer over the new link, containing the sequence number of the last CDC message that was sent over the old link. The peer validates that this sequence number is the same as or lower than the number it received, and aborts the connection if messages were lost. Add smcr_cdc_msg_send_validation() to send the validation message, and call it from smc_switch_cursor() when a connection has been switched. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller
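Aside: the validation described above leans on serial-number arithmetic. tx_cdc_seq is an unsigned 16-bit counter that wraps, so "sequence already seen" is decided by interpreting the difference as a signed 16-bit value, which is sound as long as far fewer than 0x8000 messages are ever in flight. A self-contained sketch of that comparison follows (the next patch performs the equivalent check in smc_cdc_msg_validate(); the function name here is illustrative).

#include <stdint.h>
#include <stdio.h>

/* wrapping u16 compare: recv_seq counts as already seen iff the difference,
 * read as a signed 16-bit value, is non-negative
 */
static int seq_already_seen(uint16_t local_seq, uint16_t recv_seq)
{
	int16_t diff = (int16_t)(local_seq - recv_seq);

	return diff >= 0;
}

int main(void)
{
	printf("%d\n", seq_already_seen(0x0002, 0xfffe)); /* 1: seen, wrapped */
	printf("%d\n", seq_already_seen(0xfffe, 0x0002)); /* 0: peer is ahead */
	return 0;
}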
--- net/smc/smc_cdc.c | 25 +++++++++++++++++++++++++ net/smc/smc_cdc.h | 1 + net/smc/smc_core.c | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 3ca986066f32..e6b7eef71831 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -115,6 +115,31 @@ int smc_cdc_msg_send(struct smc_connection *conn, return rc; } +/* send a validation msg indicating the move of a conn to an other QP link */ +int smcr_cdc_msg_send_validation(struct smc_connection *conn) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + struct smc_link *link = conn->lnk; + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + struct smc_cdc_msg *peer; + int rc; + + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); + if (rc) + return rc; + + peer = (struct smc_cdc_msg *)wr_buf; + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */ + peer->token = htonl(local->token); + peer->prod_flags.failover_validation = 1; + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + return rc; +} + static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 42246b4bdcc9..9cfabc9af120 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -313,6 +313,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 21bc1ec07e99..a558ce0bde97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -483,7 +483,7 @@ static int smc_switch_cursor(struct smc_sock *smc) if (smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { - /* tbd: call rc = smc_cdc_get_slot_and_msg_send(conn); */ + rc = smcr_cdc_msg_send_validation(conn); if (!rc) { schedule_delayed_work(&conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); -- cgit v1.2.3 From b286a0651e4404ab96cdfdcdad8a839a26b3751e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:40 +0200 Subject: net/smc: handle incoming CDC validation message Call smc_cdc_msg_validate() when a CDC message with the failover validation bit set is received. Validate that the sequence number sent with the message is one we have already received. If not, messages were lost and the connection is terminated using a new abort_work. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S.
Miller --- net/smc/smc.h | 2 ++ net/smc/smc_cdc.c | 37 +++++++++++++++++++++++++++++++------ net/smc/smc_core.c | 15 +++++++++++++++ 3 files changed, 48 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 1e9113771600..6f1c42da7a4c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,12 +188,14 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index e6b7eef71831..b2b85e1be72c 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -282,6 +282,28 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, sk_send_sigurg(&smc->sk); } +static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, + struct smc_link *link) +{ + struct smc_connection *conn = &smc->conn; + u16 recv_seq = ntohs(cdc->seqno); + s16 diff; + + /* check that seqnum was seen before */ + diff = conn->local_rx_ctrl.seqno - recv_seq; + if (diff < 0) { /* diff larger than 0x7fff */ + /* drop connection */ + conn->out_of_sync = 1; /* prevent any further receives */ + spin_lock_bh(&conn->send_lock); + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + conn->lnk = link; + spin_unlock_bh(&conn->send_lock); + sock_hold(&smc->sk); /* sock_put in abort_work */ + if (!schedule_work(&conn->abort_work)) + sock_put(&smc->sk); + } +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_cdc_msg *cdc) { @@ -412,16 +434,19 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!conn || conn->out_of_sync) return; smc = container_of(conn, struct smc_sock, conn); - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; + if (cdc->prod_flags.failover_validation) { + smc_cdc_msg_validate(smc, cdc, link); + return; } + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + smc_cdc_msg_recv(smc, cdc); } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a558ce0bde97..b5633fa19b6d 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -615,6 +615,8 @@ void smc_conn_free(struct smc_connection *conn) tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); + if (current_work() != &conn->abort_work) + cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_lgr_unregister_conn(conn); @@ -996,6 +998,18 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* abort connection, abort_work scheduled from tasklet context */ +static void smc_conn_abort_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + abort_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + smc_conn_kill(conn, true); + sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ +} + /* link is up - establish alternate link if applicable */ 
static void smcr_link_up(struct smc_link_group *lgr, struct smc_ib_device *smcibdev, u8 ibport) @@ -1302,6 +1316,7 @@ create: conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ -- cgit v1.2.3 From 09c61d24f96dfef7791debfcaf96efe067ab2ba8 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:41 +0200 Subject: net/smc: wait for departure of an IB message Introduce smc_wr_tx_send_wait() to send an IB message and wait for the tx completion event of the message. This makes sure that the message is no longer in-flight when the function returns. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.h | 1 + net/smc/smc_wr.c | 39 +++++++++++++++++++++++++++++++++++++++ net/smc/smc_wr.h | 2 ++ 3 files changed, 42 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 584f11230c4f..86eebbadc8f6 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -85,6 +85,7 @@ struct smc_link { struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ atomic_long_t wr_tx_id; /* seq # of last sent WR */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 3fd27bea4f7a..7239ba9b99dc 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -44,6 +44,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ struct smc_link *link; u32 idx; struct smc_wr_tx_pend_priv priv; + u8 compl_requested; }; /******************************** send queue *********************************/ @@ -103,6 +104,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) if (pnd_snd_idx == link->wr_tx_cnt) return; link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[pnd_snd_idx], 0, @@ -275,6 +278,33 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) return rc; } +/* Send prepared WR slot via ib_post_send and wait for send completion + * notification. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout) +{ + struct smc_wr_tx_pend *pend; + int rc; + + pend = container_of(priv, struct smc_wr_tx_pend, priv); + pend->compl_requested = 1; + init_completion(&link->wr_tx_compl[pend->idx]); + + rc = smc_wr_tx_send(link, priv); + if (rc) + return rc; + /* wait for completion by smc_wr_tx_process_cqe() */ + rc = wait_for_completion_interruptible_timeout( + &link->wr_tx_compl[pend->idx], timeout); + if (rc <= 0) + rc = -ENODATA; + if (rc > 0) + rc = 0; + return rc; +} + /* Register a memory region and wait for result. 
*/ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { @@ -555,6 +585,8 @@ void smc_wr_free_link(struct smc_link *lnk) void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_compl); + lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; kfree(lnk->wr_tx_mask); @@ -625,8 +657,15 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; + link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_compl[0]), + GFP_KERNEL); + if (!link->wr_tx_compl) + goto no_mem_wr_tx_pends; return 0; +no_mem_wr_tx_pends: + kfree(link->wr_tx_pends); no_mem_wr_tx_mask: kfree(link->wr_tx_mask); no_mem_wr_rx_sges: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index f7eaeb3391f3..423b8709f1c9 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -101,6 +101,8 @@ int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, -- cgit v1.2.3 From f3811fd7bc97587b142fed9edf8c726694220cb2 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:42 +0200 Subject: net/smc: send DELETE_LINK, ALL message and wait for send to complete Add smc_llc_send_message_wait() which uses smc_wr_tx_send_wait() to send an LLC message and waits for the message send to complete. smc_llc_send_link_delete_all() calls the new function to send an DELETE_LINK,ALL LLC message. The RFC states that the sender of this type of message needs to wait for the completion event of the message transmission and can terminate the link afterwards. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 5 +++++ net/smc/smc_llc.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 2 ++ 3 files changed, 51 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b5633fa19b6d..8f630b76c5a4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -238,6 +238,9 @@ static void smc_lgr_free_work(struct work_struct *work) spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); + if (!lgr->is_smcd && !lgr->terminating) + smc_llc_send_link_delete_all(lgr, true, + SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); if (!lgr->is_smcd) { @@ -847,6 +850,8 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { + smc_llc_send_link_delete_all(lgr, false, + SMC_LLC_DEL_OP_INIT_TERM); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 8d2368accbad..0ea7ad6188ae 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -560,6 +560,25 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) return smc_wr_tx_send(link, pend); } +/* schedule an llc send on link, may wait for buffers, + * and wait for send completion notification. 
+ * @return 0 on success + */ +static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) +{ + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +} + /********************************* receive ***********************************/ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, @@ -1215,6 +1234,29 @@ out: kfree(qentry); } +/* try to send a DELETE LINK ALL request on any active link, + * waiting for send completion + */ +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) +{ + struct smc_llc_msg_del_link delllc = {0}; + int i; + + delllc.hd.common.type = SMC_LLC_DELETE_LINK; + delllc.hd.length = sizeof(delllc); + if (ord) + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc.reason = htonl(rsn); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) + break; + } +} + static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) { struct smc_llc_msg_del_link *del_llc; @@ -1230,6 +1272,8 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { /* delete entire lgr */ + smc_llc_send_link_delete_all(lgr, true, ntohl( + qentry->msg.delete_link.reason)); smc_lgr_terminate_sched(lgr); goto out; } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index c335fc5f363c..6d2a5d943b83 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -89,6 +89,8 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, int time_out, u8 exp_msg); struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, + u32 rsn); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); int smc_llc_srv_add_link(struct smc_link *link); void smc_llc_srv_add_link_local(struct smc_link *link); -- cgit v1.2.3 From 56bc3b2094b428d808dd1704fdb3086c66bcb310 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:43 +0200 Subject: net/smc: assign link to a new connection For new connections, assign a link from the link group, using some simple load balancing. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 65 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8f630b76c5a4..9c19b9aa3719 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -121,16 +121,59 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } +/* assign an SMC-R link to the connection */ +static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) +{ + enum smc_link_state expected = first ? 
SMC_LNK_ACTIVATING : + SMC_LNK_ACTIVE; + int i, j; + + /* do link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &conn->lgr->lnk[i]; + + if (lnk->state != expected) + continue; + if (conn->lgr->role == SMC_CLNT) { + conn->lnk = lnk; /* temporary, SMC server assigns link*/ + break; + } + if (conn->lgr->conns_num % 2) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + struct smc_link *lnk2; + + lnk2 = &conn->lgr->lnk[j]; + if (lnk2->state == expected) { + conn->lnk = lnk2; + break; + } + } + } + if (!conn->lnk) + conn->lnk = lnk; + break; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + return 0; +} + /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static int smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); + int rc; + if (!conn->lgr->is_smcd) { + rc = smcr_lgr_conn_assign_link(conn, first); + if (rc) + return rc; + } /* find a new alert_token_local value not yet used by some connection * in this link group */ @@ -141,22 +184,6 @@ static int smc_lgr_register_conn(struct smc_connection *conn) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); - - /* assign the new connection to a link */ - if (!conn->lgr->is_smcd) { - struct smc_link *lnk; - int i; - - /* tbd - link balancing */ - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - lnk = &conn->lgr->lnk[i]; - if (lnk->state == SMC_LNK_ACTIVATING || - lnk->state == SMC_LNK_ACTIVE) - conn->lnk = lnk; - } - if (!conn->lnk) - return SMC_CLC_DECL_NOACTLINK; - } conn->lgr->conns_num++; return 0; } @@ -1285,7 +1312,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - rc = smc_lgr_register_conn(conn); /* add conn to lgr */ + rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); if (!rc && delayed_work_pending(&lgr->free_work)) cancel_delayed_work(&lgr->free_work); @@ -1313,7 +1340,7 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) goto out; -- cgit v1.2.3 From ad6c111b8ae760114df6765d5a5ed1b09020d45d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:44 +0200 Subject: net/smc: asymmetric link tagging New connections must not be assigned to asymmetric links. Add asymmetric link tagging using new link variable link_is_asym. The new helpers smcr_lgr_set_type() and smcr_lgr_set_type_asym() are called to set the state of the link group, and tag all links accordingly. smcr_lgr_conn_assign_link() respects the link tagging and will not assign new connections to links tagged as asymmetric link. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 26 +++++++++++++++++++++++--- net/smc/smc_core.h | 4 ++++ net/smc/smc_llc.c | 20 ++++++++++++++------ 3 files changed, 41 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9c19b9aa3719..be15b30a1234 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -132,7 +132,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; - if (lnk->state != expected) + if (lnk->state != expected || lnk->link_is_asym) continue; if (conn->lgr->role == SMC_CLNT) { conn->lnk = lnk; /* temporary, SMC server assigns link*/ @@ -143,7 +143,8 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) struct smc_link *lnk2; lnk2 = &conn->lgr->lnk[j]; - if (lnk2->state == expected) { + if (lnk2->state == expected && + !lnk2->link_is_asym) { conn->lnk = lnk2; break; } @@ -1030,6 +1031,25 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* set new lgr type and clear all asymmetric link tagging */ +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + lgr->lnk[i].link_is_asym = false; + lgr->type = new_type; +} + +/* set new lgr type and tag a link as asymmetric */ +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx) +{ + smcr_lgr_set_type(lgr, new_type); + lgr->lnk[asym_lnk_idx].link_is_asym = true; +} + /* abort connection, abort_work scheduled from tasklet context */ static void smc_conn_abort_work(struct work_struct *work) { @@ -1123,7 +1143,7 @@ static void smcr_link_down(struct smc_link *lnk) smcr_link_clear(lnk); return; } - lgr->type = SMC_LGR_SINGLE; + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); del_link_id = lnk->link_id; if (lgr->role == SMC_SERV) { diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 86eebbadc8f6..6ed7ab6d89d5 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -117,6 +117,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_idx; /* index in lgr link array */ + u8 link_is_asym; /* is link asymmetric? 
*/ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ @@ -380,6 +381,9 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, void smcr_link_clear(struct smc_link *lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx); int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0ea7ad6188ae..f65b2aac6b52 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -796,7 +796,11 @@ static int smc_llc_cli_conf_link(struct smc_link *link, return -ENOLINK; } smc_llc_link_active(link_new); - lgr->type = lgr_new_t; + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); return 0; } @@ -1038,7 +1042,11 @@ static int smc_llc_srv_conf_link(struct smc_link *link, return -ENOLINK; } smc_llc_link_active(link_new); - lgr->type = lgr_new_t; + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); return 0; } @@ -1223,9 +1231,9 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) if (lnk_del == lnk_asym) { /* expected deletion of asym link, don't change lgr state */ } else if (active_links == 1) { - lgr->type = SMC_LGR_SINGLE; + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); } else if (!active_links) { - lgr->type = SMC_LGR_NONE; + smcr_lgr_set_type(lgr, SMC_LGR_NONE); smc_lgr_terminate_sched(lgr); } out_unlock: @@ -1314,9 +1322,9 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) active_links = smc_llc_active_link_count(lgr); if (active_links == 1) { - lgr->type = SMC_LGR_SINGLE; + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); } else if (!active_links) { - lgr->type = SMC_LGR_NONE; + smcr_lgr_set_type(lgr, SMC_LGR_NONE); smc_lgr_terminate_sched(lgr); } -- cgit v1.2.3 From 3e0c40afce4ea5b08bb7e3f65c55157817116640 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:45 +0200 Subject: net/smc: add termination reason and handle LLC protocol violation Allow to set the reason code for the link group termination, and set meaningful values before termination processing is triggered. This reason code is sent to the peer in the final delete link message. When the LLC request or response layer receives a message type that was not handled, drop a warning and terminate the link group. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_core.c | 8 ++++++-- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 14 ++++++++++++++ net/smc/smc_llc.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index be15b30a1234..b6f93b44f9c7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -878,8 +878,11 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - smc_llc_send_link_delete_all(lgr, false, - SMC_LLC_DEL_OP_INIT_TERM); + u32 rsn = lgr->llc_termination_rsn; + + if (!rsn) + rsn = SMC_LLC_DEL_PROG_INIT_TERM; + smc_llc_send_link_delete_all(lgr, false, rsn); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -1018,6 +1021,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6ed7ab6d89d5..32bc45af9a1a 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -271,6 +271,8 @@ struct smc_link_group { /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ + u32 llc_termination_rsn; + /* rsn code for termination */ }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f65b2aac6b52..482acf80e26e 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1420,6 +1420,14 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } +static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) +{ + pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); + smc_lgr_terminate_sched(lgr); +} + /* flush the llc event queue */ static void smc_llc_event_flush(struct smc_link_group *lgr) { @@ -1520,6 +1528,9 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + default: + smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); + break; } out: kfree(qentry); @@ -1579,6 +1590,9 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + default: + smc_llc_protocol_violation(link->lgr, llc_type); + break; } kfree(qentry); } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 6d2a5d943b83..f5882ebf357b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -60,6 +60,14 @@ static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) return NULL; } +/* set the termination reason code for the link group */ +static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr, + u32 rsn) +{ + if (!lgr->llc_termination_rsn) + lgr->llc_termination_rsn = rsn; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); -- cgit v1.2.3 From a52bcc919b14c9d78f03b2b4ff604e5ca69c7e6d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:46 +0200 Subject: net/smc: improve termination processing Add helper smcr_lgr_link_deactivate_all() and eliminate duplicate code. In smc_lgr_free(), clear the smc-r links before smc_lgr_free_bufs() is called so that the buffers are already prepared for freeing.
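In outline, the new ordering reads (a condensed sketch of the smc_lgr_free() hunk shown below, not the complete function):

	static void smc_lgr_free(struct smc_link_group *lgr)
	{
		int i;

		if (!lgr->is_smcd) {
			mutex_lock(&lgr->llc_conf_mutex);
			for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
				if (lgr->lnk[i].state != SMC_LNK_UNUSED)
					smcr_link_clear(&lgr->lnk[i]); /* links cleared first */
			mutex_unlock(&lgr->llc_conf_mutex);
			smc_llc_lgr_clear(lgr);
		}
		smc_lgr_free_bufs(lgr); /* buffers freed only after the links are gone */
	}
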
The soft parameter in __smc_lgr_terminate() is no longer needed; smc_lgr_free() can be called directly. smc_lgr_terminate_sched() and smc_smcd_terminate() set lgr->freeing to indicate that the link group will be freed soon, avoiding unnecessary schedules of the free worker. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 61 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b6f93b44f9c7..fb391bc6781e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -237,6 +237,19 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) smc_lgr_schedule_free_work_fast(lgr); } +static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk)) + lnk->state = SMC_LNK_INACTIVE; + } + wake_up_interruptible_all(&lgr->llc_waiter); +} + static void smc_lgr_free(struct smc_link_group *lgr); static void smc_lgr_free_work(struct work_struct *work) @@ -246,7 +259,6 @@ static void smc_lgr_free_work(struct work_struct *work) free_work); spinlock_t *lgr_lock; bool conns; - int i; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); @@ -271,15 +283,8 @@ static void smc_lgr_free_work(struct work_struct *work) SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); - if (!lgr->is_smcd) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - struct smc_link *lnk = &lgr->lnk[i]; - - if (smc_link_usable(lnk)) - lnk->state = SMC_LNK_INACTIVE; - } - wake_up_interruptible_all(&lgr->llc_waiter); - } + if (!lgr->is_smcd) + smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } @@ -802,6 +807,16 @@ static void smc_lgr_free(struct smc_link_group *lgr) { int i; + if (!lgr->is_smcd) { + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i]); + } + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_lgr_clear(lgr); + } + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -811,11 +826,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].state != SMC_LNK_UNUSED) - smcr_link_clear(&lgr->lnk[i]); - } - smc_llc_lgr_clear(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -870,8 +880,6 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) static void smc_lgr_cleanup(struct smc_link_group *lgr) { - int i; - if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); @@ -883,13 +891,7 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) if (!rsn) rsn = SMC_LLC_DEL_PROG_INIT_TERM; smc_llc_send_link_delete_all(lgr, false, rsn); - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - struct smc_link *lnk = &lgr->lnk[i]; - - if (smc_link_usable(lnk)) - lnk->state = SMC_LNK_INACTIVE; - } - wake_up_interruptible_all(&lgr->llc_waiter); + smcr_lgr_link_deactivate_all(lgr); } } @@ -905,8 +907,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ - if (!soft) - cancel_delayed_work_sync(&lgr->free_work); + /* cancel free_work sync, will terminate when
lgr->freeing is set */ + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ @@ -926,10 +928,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); - if (soft) - smc_lgr_schedule_free_work_fast(lgr); - else - smc_lgr_free(lgr); + smc_lgr_free(lgr); } /* unlink link group and schedule termination */ @@ -944,6 +943,7 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) return; /* lgr already terminating */ } list_del_init(&lgr->list); + lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } @@ -962,6 +962,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) if (peer_gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); -- cgit v1.2.3 From 45fa8da0bf5cb447fcf835d184e2d3b745376e69 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:47 +0200 Subject: net/smc: create improved SMC-R link_uid The link_uid of an SMC-R link is exchanged between SMC peers and its value can be used for debugging purposes. Create a unique link_uid during link initialization and use it in communication with SMC-R peers. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 4 +++- net/smc/smc_llc.c | 18 ++++++++++++++---- net/smc/smc_llc.h | 1 + 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fb391bc6781e..fb5f685ff494 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -331,6 +331,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!ini->ib_dev->initialized) { rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 32bc45af9a1a..e2ace20db7fd 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -70,6 +70,8 @@ struct smc_rdma_wr { /* work requests per message struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; +#define SMC_LGR_ID_SIZE 4 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -116,6 +118,7 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? 
*/ struct smc_link_group *lgr; /* parent link group */ @@ -178,7 +181,6 @@ struct smc_rtoken { /* address/key of remote RMB */ u32 rkey; }; -#define SMC_LGR_ID_SIZE 4 #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 482acf80e26e..afb889d60881 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -361,7 +361,6 @@ static int smc_llc_add_pending_send(struct smc_link *link, int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; @@ -382,7 +381,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; - memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); + memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); @@ -845,7 +844,8 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_reject; smc_llc_save_add_link_info(lnk_new, llc); - lnk_new->link_id = llc->link_num; + lnk_new->link_id = llc->link_num; /* SMC server assigns link id */ + smc_llc_link_set_uid(lnk_new); rc = smc_ib_ready_link(lnk_new); if (rc) @@ -1775,12 +1775,22 @@ out: return rc; } +void smc_llc_link_set_uid(struct smc_link *link) +{ + __be32 link_uid; + + link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id); + memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); +} + /* evaluate confirm link request or response */ int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type) { - if (type == SMC_LLC_REQ) /* SMC server assigns link_id */ + if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */ qentry->link->link_id = qentry->msg.confirm_link.link_num; + smc_llc_link_set_uid(qentry->link); + } if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) return -ENOTSUPP; return 0; diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index f5882ebf357b..1b68f229cb99 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -92,6 +92,7 @@ int smc_llc_flow_initiate(struct smc_link_group *lgr, void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type); +void smc_llc_link_set_uid(struct smc_link *link); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3 From 649758fff327eeb184713db8b0b0ebfa28693077 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Mon, 4 May 2020 14:18:48 +0200 Subject: net/smc: save SMC-R peer link_uid During SMC-R link establishment the peers exchange the link_uid that is used for debugging purposes. Save the peer link_uid in smc_link so it can be retrieved by the smc_diag netlink interface. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 2 ++ net/smc/smc_core.h | 1 + net/smc/smc_llc.c | 9 +++++++++ net/smc/smc_llc.h | 1 + 4 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c67272007f41..4e4421c95ca1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -390,6 +390,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } + smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) @@ -1056,6 +1057,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } + smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e2ace20db7fd..4ae76802214f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -119,6 +119,7 @@ struct smc_link { u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ + u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ struct smc_link_group *lgr; /* parent link group */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index afb889d60881..66ddc9cf5e2f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -770,6 +770,7 @@ static int smc_llc_cli_conf_link(struct smc_link *link, smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); return -ENOLINK; } + smc_llc_save_peer_uid(qentry); smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); rc = smc_ib_modify_qp_rts(link_new); @@ -1041,6 +1042,7 @@ static int smc_llc_srv_conf_link(struct smc_link *link, false, SMC_LLC_DEL_LOST_PATH); return -ENOLINK; } + smc_llc_save_peer_uid(qentry); smc_llc_link_active(link_new); if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) @@ -1783,6 +1785,13 @@ void smc_llc_link_set_uid(struct smc_link *link) memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); } +/* save peers link user id, used for debug purposes */ +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry) +{ + memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid, + SMC_LGR_ID_SIZE); +} + /* evaluate confirm link request or response */ int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type) diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 1b68f229cb99..55287376112d 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -93,6 +93,7 @@ void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, enum smc_llc_reqresp type); void smc_llc_link_set_uid(struct smc_link *link); +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry); struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, struct smc_link *lnk, int time_out, u8 exp_msg); -- cgit v1.2.3 From bf6dba76d278d296b385b436d3ac7de56c190d44 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 30 Apr 2020 13:42:22 +0200 Subject: net: sched: fall back to qdisc noqueue if default qdisc setup fails Currently, if the default qdisc setup/init fails, the device ends up with qdisc "noop", which causes all TX packets to get dropped.
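For background on why "noqueue" is the safe choice here (a sketch of the two relevant sch_generic.c callbacks as they looked around this time, reconstructed for context and not part of the patch itself): the noop qdisc has an enqueue function that unconditionally drops, whereas noqueue clears its enqueue pointer, so __dev_queue_xmit() sees q->enqueue == NULL and hands packets directly to the driver instead of queueing them:

	static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
				struct sk_buff **to_free)
	{
		__qdisc_drop(skb, to_free);	/* every packet is dropped */
		return NET_XMIT_CN;
	}

	static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
				struct netlink_ext_ack *extack)
	{
		/* nothing to allocate, so this init cannot fail; the NULL
		 * enqueue makes the transmit path bypass queueing entirely
		 */
		qdisc->enqueue = NULL;
		return 0;
	}
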
With the introduction of sysctl net/core/default_qdisc it is possible to change the default qdisc to be more advanced, which opens up the possibility that Qdisc_ops->init() can fail. This patch detects this kind of failure and falls back to qdisc "noqueue", which is so simple that its init call will not fail. This allows the interface to continue functioning. V2: As this also captures memory failures, which are transient, the device is not kept in IFF_NO_QUEUE state. This allows the net_device to retry the default qdisc assignment. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2efd5b61acef..ad24fa1a51e6 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1037,10 +1037,9 @@ static void attach_one_default_qdisc(struct net_device *dev, ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); + if (!qdisc) return; - } + if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; @@ -1065,6 +1064,18 @@ static void attach_default_qdiscs(struct net_device *dev) qdisc->ops->attach(qdisc); } } + + /* Detect default qdisc setup/init failed and fallback to "noqueue" */ + if (dev->qdisc == &noop_qdisc) { + netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", + default_qdisc_ops->id, noqueue_qdisc_ops.id); + dev->priv_flags |= IFF_NO_QUEUE; + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + dev->qdisc = txq->qdisc_sleeping; + qdisc_refcount_inc(dev->qdisc); + dev->priv_flags ^= IFF_NO_QUEUE; + } + #ifdef CONFIG_NET_SCHED if (dev->qdisc != &noop_qdisc) qdisc_hash_add(dev->qdisc, false); -- cgit v1.2.3 From 39d010504e6b4485d7ceee167743620dd33f4417 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2020 07:07:41 -0700 Subject: net_sched: sch_fq: add horizon attribute QUIC servers would like to use SO_TXTIME, without having CAP_NET_ADMIN, to efficiently pace UDP packets. As far as sch_fq is concerned, we need to add safety checks, so that a buggy application does not fill the qdisc with packets having delivery time far in the future. This patch adds a configurable horizon (default: 10 seconds), and a configurable policy when a packet is beyond the horizon at enqueue() time: - either drop the packet (default policy) - or cap its delivery time to the horizon. $ tc -s -d qd sh dev eth0 qdisc fq 8022: root refcnt 257 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 quantum 10Kb initial_quantum 51160b low_rate_threshold 550Kbit refill_delay 40.0ms timer_slack 10.000us horizon 10.000s Sent 1234215879 bytes 837099 pkt (dropped 21, overlimits 0 requeues 6) backlog 0b 0p requeues 6 flows 1191 (inactive 1177 throttled 0) gc 0 highprio 0 throttled 692 latency 11.480us pkts_too_long 0 alloc_errors 0 horizon_drops 21 horizon_caps 0 v2: fixed an overflow on 32bit kernels in fq_init(), reported by kbuild test robot Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S.
Miller --- include/uapi/linux/pkt_sched.h | 6 +++++ net/sched/sch_fq.c | 59 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 0c02737c8f47..a95f3ae7ab37 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -913,6 +913,10 @@ enum { TCA_FQ_TIMER_SLACK, /* timer slack */ + TCA_FQ_HORIZON, /* time horizon in us */ + + TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ + __TCA_FQ_MAX }; @@ -932,6 +936,8 @@ struct tc_fq_qd_stats { __u32 throttled_flows; __u32 unthrottle_latency_ns; __u64 ce_mark; /* packets above ce_threshold */ + __u64 horizon_drops; + __u64 horizon_caps; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4f0104243cc2..8f06a808c59a 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -100,6 +100,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + u64 ktime_cache; /* copy of last ktime_get_ns() */ unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ @@ -109,12 +110,13 @@ struct fq_sched_data { u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ u64 ce_threshold; + u64 horizon; /* horizon in ns */ u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; - + u8 horizon_drop; u32 flows; u32 inactive_flows; u32 throttled_flows; @@ -123,6 +125,8 @@ struct fq_sched_data { u64 stat_internal_packets; u64 stat_throttled; u64 stat_ce_mark; + u64 stat_horizon_drops; + u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -402,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) struct rb_node **p, *parent; struct sk_buff *head, *aux; - fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); - head = flow->head; if (!head || fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { @@ -431,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) rb_insert_color(&skb->rbnode, &flow->t_root); } +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, + const struct fq_sched_data *q) +{ + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); +} + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -440,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + if (!skb->tstamp) { + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + } else { + /* Check if packet timestamp is too far in the future. + * Try first if our cached value, to avoid ktime_get_ns() + * cost in most cases. 
+ */ + if (fq_packet_beyond_horizon(skb, q)) { + /* Refresh our cache and check another time */ + q->ktime_cache = ktime_get_ns(); + if (fq_packet_beyond_horizon(skb, q)) { + if (q->horizon_drop) { + q->stat_horizon_drops++; + return qdisc_drop(skb, sch, to_free); + } + q->stat_horizon_caps++; + skb->tstamp = q->ktime_cache + q->horizon; + } + } + fq_skb_cb(skb)->time_to_send = skb->tstamp; + } + f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; @@ -512,7 +542,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) goto out; } - now = ktime_get_ns(); + q->ktime_cache = now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -765,6 +795,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -854,7 +886,15 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_TIMER_SLACK]) q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (tb[TCA_FQ_HORIZON]) + q->horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON]); + + if (tb[TCA_FQ_HORIZON_DROP]) + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + if (!err) { + sch_tree_unlock(sch); err = fq_resize(sch, fq_log); sch_tree_lock(sch); @@ -907,6 +947,9 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -924,6 +967,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); u64 ce_threshold = q->ce_threshold; + u64 horizon = q->horizon; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -933,6 +977,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ do_div(ce_threshold, NSEC_PER_USEC); + do_div(horizon, NSEC_PER_USEC); if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || @@ -948,7 +993,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -979,6 +1026,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); st.ce_mark = q->stat_ce_mark; + st.horizon_drops = q->stat_horizon_drops; + st.horizon_caps = q->stat_horizon_caps; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.3 From dd86fec7e06ab792fe470c66a67ff42bf5d72b91 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:40 -0700 Subject: devlink: factor out building a snapshot notification We'll need to send snapshot info back on the 
socket which requested a snapshot to be created. Factor out constructing a snapshot description from the broadcast notification code. v3: new patch Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/core/devlink.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 80f97722f31f..2b7c60c18b99 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3716,24 +3716,26 @@ nla_put_failure: return err; } -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) +static struct sk_buff * +devlink_nl_region_notify_build(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return ERR_PTR(-ENOMEM); - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); + if (!hdr) { + err = -EMSGSIZE; goto out_free_msg; + } err = devlink_nl_put_handle(msg, devlink); if (err) @@ -3757,15 +3759,30 @@ static void devlink_nl_region_notify(struct devlink_region *region, } genlmsg_end(msg, hdr); - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - - return; + return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); + return ERR_PTR(err); +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); + if (IS_ERR(msg)) + return; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** -- cgit v1.2.3 From 043b3e22768d5d909cb1474fc21ae2fbaf026c0c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:41 -0700 Subject: devlink: let kernel allocate region snapshot id Currently users have to choose a free snapshot id before calling DEVLINK_CMD_REGION_NEW. This is potentially racy and inconvenient. Make the DEVLINK_ATTR_REGION_SNAPSHOT_ID optional and try to allocate id automatically. Send a message back to the caller with the snapshot info. Example use: $ devlink region new netdevsim/netdevsim1/dummy netdevsim/netdevsim1/dummy: snapshot 1 $ id=$(devlink -j region new netdevsim/netdevsim1/dummy | \ jq '.[][][][]') $ devlink region dump netdevsim/netdevsim1/dummy snapshot $id [...] $ devlink region del netdevsim/netdevsim1/dummy snapshot $id v4: - inline the notification code v3: - send the notification only once snapshot creation completed. v2: - don't wrap the line containing extack; - add a few sentences to the docs. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. 
Miller --- .../networking/devlink/devlink-region.rst | 7 ++- net/core/devlink.c | 57 +++++++++++++++++----- .../selftests/drivers/net/netdevsim/devlink.sh | 13 +++++ 3 files changed, 62 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index 04e04d1ff627..daf35427fce1 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -23,7 +23,9 @@ states, but see also :doc:`devlink-health` Regions may optionally support capturing a snapshot on demand via the ``DEVLINK_CMD_REGION_NEW`` netlink message. A driver wishing to allow requested snapshots must implement the ``.snapshot`` callback for the region -in its ``devlink_region_ops`` structure. +in its ``devlink_region_ops`` structure. If snapshot id is not set in +the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send +the snapshot information to user space. example usage ------------- @@ -45,7 +47,8 @@ example usage $ devlink region del pci/0000:00:05.0/cr-space snapshot 1 # Request an immediate snapshot, if supported by the region - $ devlink region new pci/0000:00:05.0/cr-space snapshot 5 + $ devlink region new pci/0000:00:05.0/cr-space + pci/0000:00:05.0/cr-space: snapshot 5 # Dump a snapshot: $ devlink region dump pci/0000:00:05.0/fw-health snapshot 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b7c60c18b99..43a9d5be73ca 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4086,6 +4086,8 @@ static int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; u32 snapshot_id; @@ -4097,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); - return -EINVAL; - } - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); if (!region) { @@ -4119,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -ENOSPC; } - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; - } + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - return err; + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + return err; + } + } err = region->ops->snapshot(devlink, info->extack, &data); if (err) @@ -4138,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_snapshot_create; + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = 
devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) + return -EINVAL; + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + return 0; err_snapshot_create: @@ -4145,6 +4172,10 @@ err_snapshot_create: err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); + return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 9f9741444549..ad539eccddcb 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -151,6 +151,19 @@ regions_test() check_region_snapshot_count dummy post-second-delete 2 + sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]') + check_err $? "Failed to create a new snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null + check_err $? "Failed to dump a snapshot with id allocated by the kernel" + + devlink region del $DL_HANDLE/dummy snapshot $sid + check_err $? "Failed to delete snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 2 + log_test "regions test" } -- cgit v1.2.3 From 1a33e10e4a95cb109ff1145098175df3113313ef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 2 May 2020 22:22:19 -0700 Subject: net: partially revert dynamic lockdep key changes This patch reverts the following commits: commit 064ff66e2bef84f1153087612032b5b9eab005bd "bonding: add missing netdev_update_lockdep_key()" commit 53d374979ef147ab51f5d632dfe20b14aebeccd0 "net: avoid updating qdisc_xmit_lock_key in netdev_update_lockdep_key()" commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 "net: fix kernel-doc warning in <linux/netdevice.h>" commit ab92d68fc22f9afab480153bd82a20f6e2533769 "net: core: add generic lockdep keys" but keeps the addr_list_lock_key because we still lock addr_list_lock nestedly on stack devices; unlike xmit_lock, this is safe because we don't take addr_list_lock on any fast path. Reported-and-tested-by: syzbot+aaa6fa4949cc5d9b7b25@syzkaller.appspotmail.com Cc: Dmitry Vyukov Cc: Taehee Yoo Signed-off-by: Cong Wang Acked-by: Taehee Yoo Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_main.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 16 ++++ drivers/net/hamradio/bpqether.c | 20 +++++ drivers/net/hyperv/netvsc_drv.c | 2 + drivers/net/ipvlan/ipvlan_main.c | 2 + drivers/net/macsec.c | 2 + drivers/net/macvlan.c | 2 + drivers/net/ppp/ppp_generic.c | 2 + drivers/net/team/team.c | 1 + drivers/net/vrf.c | 1 + drivers/net/wireless/intersil/hostap/hostap_hw.c | 22 ++++++ include/linux/netdevice.h | 27 +++++-- net/8021q/vlan_dev.c | 21 ++++++ net/batman-adv/soft-interface.c | 30 ++++++++ net/bluetooth/6lowpan.c | 8 ++ net/core/dev.c | 90 ++++++++++++++++++----- net/dsa/slave.c | 12 +++ net/ieee802154/6lowpan/core.c | 8 ++ net/l2tp/l2tp_eth.c | 1 + net/netrom/af_netrom.c | 21 ++++++ net/rose/af_rose.c | 21 ++++++ net/sched/sch_generic.c | 17 +++-- 22 files changed, 294 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2e70e43c5df5..d01871321d22 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4898,6 +4898,7 @@ static int bond_init(struct net_device *bond_dev) spin_lock_init(&bond->stats_lock); lockdep_register_key(&bond->stats_lock_key); lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key); + netdev_lockdep_set_classes(bond_dev); list_add_tail(&bond->bond_list, &bn->dev_list); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 79d72c88bbef..b3cabc274121 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -299,6 +299,20 @@ static void nfp_repr_clean(struct nfp_repr *repr) nfp_port_free(repr->port); } +static struct lock_class_key nfp_repr_netdev_xmit_lock_key; + +static void nfp_repr_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nfp_repr_netdev_xmit_lock_key); +} + +static void nfp_repr_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nfp_repr_set_lockdep_class_one, NULL); +} + int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 cmsg_port_id, struct nfp_port *port, struct net_device *pf_netdev) @@ -308,6 +322,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 repr_cap = nn->tlv_caps.repr_cap; int err; + nfp_repr_set_lockdep_class(netdev); + repr->port = port; repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL); if (!repr->dst) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index fbea6f232819..206688154fdf 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -107,6 +107,25 @@ struct bpqdev { static LIST_HEAD(bpq_devices); +/* + * bpqether network devices are paired with ethernet devices below them, so + * form a special "super class" of normal ethernet devices; split their locks + * off into a separate class since they always nest. 
+ */ +static struct lock_class_key bpq_netdev_xmit_lock_key; + +static void bpq_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &bpq_netdev_xmit_lock_key); +} + +static void bpq_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); +} + /* ------------------------------------------------------------------------ */ @@ -477,6 +496,7 @@ static int bpq_new_device(struct net_device *edev) err = register_netdevice(ndev); if (err) goto error; + bpq_set_lockdep_class(ndev); /* List protected by RTNL */ list_add_rcu(&bpq->bpq_list, &bpq_devices); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8e86bdbfba1..c0b647a4c893 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2456,6 +2456,8 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; + netdev_lockdep_set_classes(net); + /* MTU range: 68 - 1500 or 65521 */ net->min_mtu = NETVSC_MTU_MIN; if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f195f278a83a..15e87c097b0b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -131,6 +131,8 @@ static int ipvlan_init(struct net_device *dev) dev->gso_max_segs = phy_dev->gso_max_segs; dev->hard_header_len = phy_dev->hard_header_len; + netdev_lockdep_set_classes(dev); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 758baf7cb8a1..ea3f25cc79ef 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4047,6 +4047,8 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err < 0) return err; + netdev_lockdep_set_classes(dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d45600e0a38c..34eb073cdd74 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -890,6 +890,8 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; + netdev_lockdep_set_classes(dev); + vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22cc2cb9d878..7d005896a0f9 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1410,6 +1410,8 @@ static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; + netdev_lockdep_set_classes(dev); + ppp = netdev_priv(dev); /* Let the netdevice take a reference on the ppp file. 
This ensures * that ppp_destroy_interface() won't run before the device gets diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 04845a4017f9..8c1e02752ff6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1647,6 +1647,7 @@ static int team_init(struct net_device *dev) lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); + netdev_lockdep_set_classes(dev); return 0; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 56f8aab46f89..43928a1c2f2a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -867,6 +867,7 @@ static int vrf_dev_init(struct net_device *dev) /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; + netdev_lockdep_set_classes(dev); return 0; out_rth: diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index 58212c532c90..aadf3dec5bf3 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3041,6 +3041,27 @@ static void prism2_clear_set_tim_queue(local_info_t *local) } } + +/* + * HostAP uses two layers of net devices, where the inner + * layer gets called all the time from the outer layer. + * This is a natural nesting, which needs a split lock type. + */ +static struct lock_class_key hostap_netdev_xmit_lock_key; + +static void prism2_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &hostap_netdev_xmit_lock_key); +} + +static void prism2_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); +} + static struct net_device * prism2_init_local_data(struct prism2_helper_functions *funcs, int card_idx, struct device *sdev) @@ -3199,6 +3220,7 @@ while (0) if (ret >= 0) ret = register_netdevice(dev); + prism2_set_lockdep_class(dev); rtnl_unlock(); if (ret < 0) { printk(KERN_WARNING "%s: register netdevice failed!\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a8d40f1ffe2..7725efd6e48a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1805,13 +1805,11 @@ enum netdev_priv_flags { * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. 
- * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock - * spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount - * @qdisc_xmit_lock_key: lockdep class annotating - * netdev_queue->_xmit_lock spinlock + * * @addr_list_lock_key: lockdep class annotating * net_device->addr_list_lock spinlock + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock + * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2112,10 +2110,9 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key qdisc_tx_busylock_key; - struct lock_class_key qdisc_running_key; - struct lock_class_key qdisc_xmit_lock_key; struct lock_class_key addr_list_lock_key; + struct lock_class_key *qdisc_tx_busylock; + struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; @@ -2200,6 +2197,20 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + (dev)->qdisc_running_key = &qdisc_running_key; \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 990b9fde28c6..319220b2341d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,6 +489,25 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *unused) +{ + lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -579,6 +598,8 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); + vlan_dev_set_lockdep_class(dev); + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5f05a728f347..822af540b854 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -739,6 +739,34 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } +/* batman-adv network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. 
+ */ +static struct lock_class_key batadv_netdev_xmit_lock_key; + +/** + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue + * @dev: device which owns the tx queue + * @txq: tx queue to modify + * @_unused: always NULL + */ +static void batadv_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); +} + +/** + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class + * @dev: network device to modify + */ +static void batadv_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); +} + /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -752,6 +780,8 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; + batadv_set_lockdep_class(dev); + bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4febc82a7c76..bb55d92691b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,7 +571,15 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } +static int bt_dev_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + + return 0; +} + static const struct net_device_ops netdev_ops = { + .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/core/dev.c b/net/core/dev.c index afff16849c26..f8d83922a6af 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", 
"_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -9208,7 +9276,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9323,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9889,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +9978,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ba8bf90dc0cc..fa2634043751 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1671,6 +1671,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1754,6 +1763,9 @@ int 
dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index c0b107cdd715..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,6 +58,13 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_lockdep_set_classes(ldev); + + return 0; +} + static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -89,6 +96,7 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d3b520b9b2c9..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,6 +56,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7b1a74f74aad..eccc7d366e17 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,6 +63,26 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; +/* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key nr_netdev_xmit_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + /* * Socket removal during an interrupt is now safe. */ @@ -1394,6 +1414,7 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } + nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 1e8eeb044b07..e7a872207b46 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -64,6 +64,26 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; +/* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key rose_netdev_xmit_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + /* * Convert a ROSE address into text. 
*/ @@ -1511,6 +1531,7 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } + rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ad24fa1a51e6..ebc55d884247 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); - if (sch != &noop_qdisc) { - lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->running, &dev->qdisc_running_key); - } - return sch; errout1: kfree(p); -- cgit v1.2.3 From e4e5aefc113510c03d34e182ab30bc0cc196675c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:51 +0200 Subject: xsk: Change two variable names for increased clarity Change two variable names so that it is clearer what they represent. The first one is xsk_list, which in fact only contains the list of AF_XDP sockets with a Tx component. Change this to xsk_tx_list for improved clarity. The second variable is size in the ring structure. One might think that this is the size of the ring, but it is in fact the size of the umem, copied into the ring structure to improve performance. Rename this variable to umem_size to avoid any confusion.
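For illustration only, here is a minimal userspace C sketch of the validation path that motivates the second rename; the struct and function names are simplified stand-ins rather than the kernel's exact definitions, and only the bounds test mirrors the patched xskq_cons_is_valid_addr() logic. The ring caches the umem size so a descriptor address can be checked without touching the umem object itself:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct xsk_queue: umem_size is a cached copy
 * of the umem's size, while nentries is the actual ring size. Keeping
 * the two names distinct avoids the confusion the patch describes. */
struct xsk_queue {
        uint64_t umem_size;     /* size of the umem, not of the ring */
        uint32_t nentries;      /* number of ring entries */
        uint64_t invalid_descs;
};

/* Any address at or beyond the umem boundary is an invalid descriptor. */
static bool xskq_addr_ok(struct xsk_queue *q, uint64_t addr)
{
        if (addr >= q->umem_size) {
                q->invalid_descs++;
                return false;
        }
        return true;
}

int main(void)
{
        struct xsk_queue q = { .umem_size = 1ULL << 20, .nentries = 256 };

        /* prints "1 0": 4096 is inside the 1 MiB umem, 1 MiB is not */
        printf("%d %d\n", xskq_addr_ok(&q, 4096), xskq_addr_ok(&q, 1ULL << 20));
        return 0;
}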
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-2-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 ++-- net/xdp/xdp_umem.c | 14 +++++++------- net/xdp/xsk.c | 8 ++++---- net/xdp/xsk_queue.c | 4 ++-- net/xdp/xsk_queue.h | 8 ++++---- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e86ec48ef627..b72f1f4c3b15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -62,8 +62,8 @@ struct xdp_umem { struct net_device *dev; struct xdp_umem_fq_reuse *fq_reuse; bool zc; - spinlock_t xsk_list_lock; - struct list_head xsk_list; + spinlock_t xsk_tx_list_lock; + struct list_head xsk_tx_list; }; /* Nodes are linked in the struct xdp_sock map_list field, and used to diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ed7a6060f73c..7211f4572760 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -30,9 +30,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_add_rcu(&xs->list, &umem->xsk_list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_tx_list); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) @@ -42,9 +42,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } /* The umem is stored both in the _rx struct and the _tx struct as we do @@ -395,8 +395,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; - INIT_LIST_HEAD(&umem->xsk_list); - spin_lock_init(&umem->xsk_list_lock); + INIT_LIST_HEAD(&umem->xsk_tx_list); + spin_lock_init(&umem->xsk_tx_list_lock); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f6e6609f70a3..45ffd67b367d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -75,7 +75,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -102,7 +102,7 @@ void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -305,7 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } @@ -318,7 +318,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; diff --git a/net/xdp/xsk_queue.c 
b/net/xdp/xsk_queue.c index c90e9c1e3c63..57fb81bd593c 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -9,12 +9,12 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) +void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) { if (!q) return; - q->size = size; + q->umem_size = umem_size; q->chunk_mask = chunk_mask; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b50bb5c76da5..648733ec24ac 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -30,7 +30,7 @@ struct xdp_umem_ring { struct xsk_queue { u64 chunk_mask; - u64 size; + u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -123,7 +123,7 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, u64 base_addr = xsk_umem_extract_addr(addr); addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->size || addr >= q->size || + if (base_addr >= q->umem_size || addr >= q->umem_size || xskq_cons_crosses_non_contig_pg(umem, addr, length)) { q->invalid_descs++; return false; @@ -134,7 +134,7 @@ static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (addr >= q->size) { + if (addr >= q->umem_size) { q->invalid_descs++; return false; } @@ -379,7 +379,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); +void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -- cgit v1.2.3 From 07bf2d97d1f37e7ac8d7be2d84ff108d43556a1d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:52 +0200 Subject: xsk: Remove unnecessary member in xdp_umem Remove the unnecessary address member from struct xdp_umem, as it is only used during umem registration. There is no need to carry it around, since it is used neither at run time nor when unregistering the umem.
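As a hedged illustration of the pattern this patch applies (hypothetical names, plain userspace C, not the kernel API): data needed only while registering an object is passed down as an argument and consumed on the spot, instead of being stored in the long-lived structure:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical long-lived object: note there is no address field,
 * mirroring the removal of xdp_umem->address. */
struct umem {
        unsigned int npgs;
        void **pgs;
};

/* The registration-time address is consumed here and never stored. */
static int umem_pin_pages(struct umem *umem, unsigned long address)
{
        umem->pgs = calloc(umem->npgs, sizeof(*umem->pgs));
        if (!umem->pgs)
                return -1;
        printf("pinning %u pages backing 0x%lx\n", umem->npgs, address);
        return 0;
}

static int umem_reg(struct umem *umem, unsigned long address)
{
        umem->npgs = 4;
        return umem_pin_pages(umem, address);   /* pass it, don't keep it */
}

int main(void)
{
        struct umem u = { 0, NULL };
        int err = umem_reg(&u, 0x1000UL);

        free(u.pgs);
        return err ? 1 : 0;
}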
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-3-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 - net/xdp/xdp_umem.c | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index b72f1f4c3b15..67191ccaab85 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -50,7 +50,6 @@ struct xdp_umem { u32 headroom; u32 chunk_size_nohr; struct user_struct *user; - unsigned long address; refcount_t users; struct work_struct work; struct page **pgs; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7211f4572760..37ace3bc0d48 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -279,7 +279,7 @@ void xdp_put_umem(struct xdp_umem *umem) } } -static int xdp_umem_pin_pages(struct xdp_umem *umem) +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; @@ -291,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(&current->mm->mmap_sem); - npgs = pin_user_pages(umem->address, umem->npgs, + npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(&current->mm->mmap_sem); @@ -385,7 +385,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->address = (unsigned long)addr; umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK : ~((u64)chunk_size - 1); umem->size = size; @@ -404,7 +403,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) return err; - err = xdp_umem_pin_pages(umem); + err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; -- cgit v1.2.3 From 592138a88d967ae9279275ef275b729e866a552a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 5 May 2020 16:47:36 +0800 Subject: net: sched: choke: Remove unused inline function choke_set_classid There are no callers left in-tree since commit 5952fde10c35 ("net: sched: choke: remove dead filter classify code"). Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index a36974e9c601..2d350c734375 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -142,11 +142,6 @@ static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } -static inline void choke_set_classid(struct sk_buff *skb, u16 classid) -{ - choke_skb_cb(skb)->classid = classid; -} - /* * Compare flow of two packets * Returns true only if source and destination address and port match. -- cgit v1.2.3 From fe121e078da1fd8a061ab22f26c5911f8ebf46cb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 May 2020 12:08:02 -0700 Subject: sch_choke: Remove classid from choke_skb_cb. Suggested by Cong Wang. Signed-off-by: David S.
Miller --- net/sched/sch_choke.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2d350c734375..59ff466ec7cb 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -131,7 +131,6 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, } struct choke_skb_cb { - u16 classid; u8 keys_valid; struct flow_keys_digest keys; }; -- cgit v1.2.3 From 0a99be434d145079d0509473b19e840629d851c2 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 5 May 2020 15:01:20 +0200 Subject: net/smc: log important pnetid and state change events Print to system log when SMC links are available or go down, link group state changes or pnetids are applied to and removed from devices. The log entries are triggered by either user configuration actions or adapter activation/deactivation events and are not expected to happen often. The entries help SMC users to keep track of the SMC link group status and to detect when actions are needed (like to add replacements for failed adapters). Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_core.c | 34 +++++++++++++++++++++++++++++----- net/smc/smc_core.h | 2 +- net/smc/smc_ib.c | 11 +++++++++++ net/smc/smc_ism.c | 6 ++++++ net/smc/smc_llc.c | 25 +++++++++++++++++++------ net/smc/smc_llc.h | 2 +- net/smc/smc_pnet.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 8 files changed, 113 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4e4421c95ca1..903321543838 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -378,8 +378,6 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - link->lgr->type = SMC_LGR_SINGLE; - /* receive CONFIRM LINK request from server over RoCE fabric */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); @@ -414,6 +412,7 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); /* optional 2nd link, receive ADD LINK request from server */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, @@ -1037,8 +1036,6 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - link->lgr->type = SMC_LGR_SINGLE; - if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; @@ -1067,6 +1064,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); /* initial contact - try to establish second link */ smc_llc_srv_add_link(link); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fb5f685ff494..65de700e1f17 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -369,7 +369,7 @@ dealloc_pd: free_link_mem: smc_wr_free_link_mem(lnk); clear_llc_lnk: - smc_llc_link_clear(lnk); + smc_llc_link_clear(lnk, false); out: put_device(&ini->ib_dev->ibdev->dev); memset(lnk, 0, sizeof(struct smc_link)); @@ -718,14 +718,14 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } /* must be called under lgr->llc_conf_mutex lock */ -void smcr_link_clear(struct smc_link *lnk) +void smcr_link_clear(struct smc_link *lnk, bool log) { struct smc_ib_device *smcibdev; if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) return; lnk->peer_qpn = 0; - smc_llc_link_clear(lnk); + 
smc_llc_link_clear(lnk, log); smcr_buf_unmap_lgr(lnk); smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_reset(lnk); @@ -812,7 +812,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) mutex_lock(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) - smcr_link_clear(&lgr->lnk[i]); + smcr_link_clear(&lgr->lnk[i], false); } mutex_unlock(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); @@ -1040,12 +1040,36 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) /* set new lgr type and clear all asymmetric link tagging */ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) { + char *lgr_type = ""; int i; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) if (smc_link_usable(&lgr->lnk[i])) lgr->lnk[i].link_is_asym = false; + if (lgr->type == new_type) + return; lgr->type = new_type; + + switch (lgr->type) { + case SMC_LGR_NONE: + lgr_type = "NONE"; + break; + case SMC_LGR_SINGLE: + lgr_type = "SINGLE"; + break; + case SMC_LGR_SYMMETRIC: + lgr_type = "SYMMETRIC"; + break; + case SMC_LGR_ASYMMETRIC_PEER: + lgr_type = "ASYMMETRIC_PEER"; + break; + case SMC_LGR_ASYMMETRIC_LOCAL: + lgr_type = "ASYMMETRIC_LOCAL"; + break; + } + pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: " + "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr_type, lgr->pnet_id); } /* set new lgr type and tag a link as asymmetric */ @@ -1146,7 +1170,7 @@ static void smcr_link_down(struct smc_link *lnk) smc_ib_modify_qp_reset(lnk); to_lnk = smc_switch_conns(lgr, lnk, true); if (!to_lnk) { /* no backup link available */ - smcr_link_clear(lnk); + smcr_link_clear(lnk, true); return; } smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 4ae76802214f..86d160f0d187 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -383,7 +383,7 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); -void smcr_link_clear(struct smc_link *lnk); +void smcr_link_clear(struct smc_link *lnk, bool log); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 2c743caad69a..f0a5064bf9bd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -575,6 +575,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) /* trigger reading of the port attributes */ port_cnt = smcibdev->ibdev->phys_port_cnt; + pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", + smcibdev->ibdev->name, port_cnt); for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { @@ -583,6 +585,13 @@ static void smc_ib_add_dev(struct ib_device *ibdev) if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); + pr_warn_ratelimited("smc: ib device %s port %d has pnetid " + "%.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? 
+ " (user defined)" : + ""); } schedule_work(&smcibdev->port_event_work); } @@ -599,6 +608,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + pr_warn_ratelimited("smc: removing ib device %s\n", + smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 32be2da2cb85..91f85fc09fb8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -321,12 +321,18 @@ int smcd_register_dev(struct smcd_dev *smcd) list_add_tail(&smcd->list, &smcd_dev_list.list); spin_unlock(&smcd_dev_list.lock); + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&smcd->dev), smcd->pnetid, + smcd->pnetid_by_user ? " (user defined)" : ""); + return device_add(&smcd->dev); } EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&smcd->dev)); spin_lock(&smcd_dev_list.lock); list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 66ddc9cf5e2f..4cc583678ac7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -870,7 +870,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (!rc) goto out; out_clear_lnk: - smcr_link_clear(lnk_new); + smcr_link_clear(lnk_new, false); out_reject: smc_llc_cli_add_link_reject(qentry); out: @@ -977,7 +977,7 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr) } smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); out_free: - smcr_link_clear(lnk_asym); + smcr_link_clear(lnk_asym, true); } static int smc_llc_srv_rkey_exchange(struct smc_link *link, @@ -1121,7 +1121,7 @@ int smc_llc_srv_add_link(struct smc_link *link) goto out_err; return 0; out_err: - smcr_link_clear(link_new); + smcr_link_clear(link_new, false); return rc; } @@ -1227,7 +1227,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_switch_conns(lgr, lnk_del, false); smc_wr_tx_wait_no_pending_sends(lnk_del); } - smcr_link_clear(lnk_del); + smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); if (lnk_del == lnk_asym) { @@ -1320,7 +1320,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) } } } - smcr_link_clear(lnk_del); + smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); if (active_links == 1) { @@ -1711,6 +1711,12 @@ int smc_llc_link_init(struct smc_link *link) void smc_llc_link_active(struct smc_link *link) { + pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, " + "peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); link->state = SMC_LNK_ACTIVE; if (link->lgr->llc_testlink_time) { link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; @@ -1720,8 +1726,15 @@ void smc_llc_link_active(struct smc_link *link) } /* called in worker context */ -void smc_llc_link_clear(struct smc_link *link) +void smc_llc_link_clear(struct smc_link *link, bool log) { + if (log) + pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN" + ", peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, 
&link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); smc_wr_wakeup_reg_wait(link); diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 55287376112d..a5d2fe3eea61 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -82,7 +82,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); void smc_llc_link_active(struct smc_link *link); -void smc_llc_link_clear(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link, bool log); int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); int smc_llc_do_delete_rkey(struct smc_link_group *lgr, diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 50c96e843fab..be03f1260d59 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -110,8 +110,14 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { dev_put(pnetelem->ndev); + pr_warn_ratelimited("smc: net device %s " + "erased user defined " + "pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + } kfree(pnetelem); rc = 0; } @@ -130,6 +136,12 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { + pr_warn_ratelimited("smc: ib device %s ibport " + "%d erased user defined " + "pnetid %.16s\n", + ibdev->ibdev->name, + ibport + 1, + ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; @@ -144,6 +156,10 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) if (smcd_dev->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + pr_warn_ratelimited("smc: smcd device %s " + "erased user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = false; rc = 0; @@ -174,6 +190,10 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev) dev_hold(ndev); pnetelem->ndev = ndev; rc = 0; + pr_warn_ratelimited("smc: adding net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } @@ -201,6 +221,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) dev_put(pnetelem->ndev); pnetelem->ndev = NULL; rc = 0; + pr_warn_ratelimited("smc: removing net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } @@ -357,6 +381,10 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, kfree(new_pe); goto out_put; } + if (ndev) + pr_warn_ratelimited("smc: net device %s " + "applied user defined pnetid %.16s\n", + new_pe->eth_name, new_pe->pnet_name); return 0; out_put: @@ -377,11 +405,24 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); - if (ib_dev) + if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + if (ibdev_applied) + pr_warn_ratelimited("smc: ib device %s ibport %d " + "applied user defined pnetid " + "%.16s\n", 
ib_dev->ibdev->name, + ib_port, + ib_dev->pnetid[ib_port - 1]); + } smcd_dev = smc_pnet_find_smcd(ib_name); - if (smcd_dev) + if (smcd_dev) { smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + if (smcddev_applied) + pr_warn_ratelimited("smc: smcd device %s " + "applied user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); + } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. */ -- cgit v1.2.3 From fea805237dd984a71a2c5e5cf074a15505d5ba31 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 5 May 2020 15:01:21 +0200 Subject: net/smc: remove unused inline function smc_curs_read This function was left behind by commit bac6de7b6370 ("net/smc: eliminate cursor read and write calls"). Signed-off-by: YueHaibing Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_cdc.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 9cfabc9af120..2ddcc5fb5ceb 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -97,23 +97,6 @@ static inline void smc_curs_add(int size, union smc_host_cursor *curs, } } -/* SMC cursors are 8 bytes long and require atomic reading and writing */ -static inline u64 smc_curs_read(union smc_host_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - /* Copy cursor src into tgt */ static inline void smc_curs_copy(union smc_host_cursor *tgt, union smc_host_cursor *src, -- cgit v1.2.3 From f989d546a2d5a9f001f6f8be49d98c10ab9b1897 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 5 May 2020 09:05:06 -0700 Subject: erspan: Add type I version 0 support. The Type I ERSPAN frame format is based on the barebones IP + GRE (4-byte) encapsulation on top of the raw mirrored frame. Both type I and II use 0x88BE as the protocol type. Unlike type II and III, no sequence number or key is required. To create a type I erspan tunnel device: $ ip link add dev erspan11 type erspan \ local 172.16.1.100 remote 172.16.1.200 \ erspan_ver 0 Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 19 +++++++++++++++-- net/ipv4/ip_gre.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/erspan.h b/include/net/erspan.h index b39643ef4c95..0d9e86bd9893 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -2,7 +2,19 @@ #define __LINUX_ERSPAN_H /* - * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes + * GRE header for ERSPAN type I encapsulation (4 octets [34:37]) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |0|0|0|0|0|00000|000000000|00000| Protocol Type for ERSPAN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * The Type I ERSPAN frame format is based on the barebones IP + GRE + * encapsulation (as described above) on top of the raw mirrored frame. + * There is no extra ERSPAN header.
+ * + * + * GRE header for ERSPAN type II and III encapsulation (8 octets [34:41]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -43,7 +55,7 @@ * | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB + * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB */ #include @@ -139,6 +151,9 @@ static inline u8 get_hwid(const struct erspan_md2 *md2) static inline int erspan_hdr_len(int version) { + if (version == 0) + return 0; + return sizeof(struct erspan_base_hdr) + (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 029b24eeafba..e29cd48674d7 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -248,6 +248,15 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static bool is_erspan_type1(int gre_hdr_len) +{ + /* Both ERSPAN type I (version 0) and type II (version 1) use + * protocol 0x88BE, but the type I has only 4-byte GRE header, + * while type II has 8-byte. + */ + return gre_hdr_len == 4; +} + static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { @@ -262,17 +271,26 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - iph = ip_hdr(skb); - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - ver = ershdr->ver; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, - iph->saddr, iph->daddr, tpi->key); + if (is_erspan_type1(gre_hdr_len)) { + ver = 0; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + } else { + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + } if (tunnel) { + if (is_erspan_type1(gre_hdr_len)) + len = gre_hdr_len; + else + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; @@ -665,7 +683,10 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) { + if (tunnel->erspan_ver == 0) { + proto = htons(ETH_P_ERSPAN); + tunnel->parms.o_flags &= ~TUNNEL_SEQ; + } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); @@ -1066,7 +1087,10 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], if (ret) return ret; - /* ERSPAN should only have GRE sequence and key flag */ + if (nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + return 0; + + /* ERSPAN type II/III should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) @@ -1174,7 +1198,7 @@ static int erspan_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->erspan_ver != 1 && t->erspan_ver != 2) + if (t->erspan_ver > 2) return -EINVAL; } @@ -1259,7 +1283,11 @@ static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - tunnel->tun_hlen = 8; + if (tunnel->erspan_ver == 0) + tunnel->tun_hlen = 4; /* 4-byte GRE hdr.
*/ + else + tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ + tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); @@ -1456,8 +1484,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if (t->erspan_ver == 1 || t->erspan_ver == 2) { - if (!t->collect_md) + if (t->erspan_ver <= 2) { + if (t->erspan_ver != 0 && !t->collect_md) o_flags |= TUNNEL_KEY; if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) @@ -1466,7 +1494,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; - } else { + } else if (t->erspan_ver == 2) { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) -- cgit v1.2.3 From 6d64be3da282908bb17b0803b9edad8852ffea56 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:03 +0200 Subject: xfrm: avoid extract_output indirection for ipv4 We can use a direct call for ipv4, so move the needed functions to net/xfrm/xfrm_output.c and call them directly. For ipv6 the indirection can be avoided as well but it will need a bit more work -- to ease review it will be done in another patch. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/xfrm4_output.c | 40 ---------------------------------------- net/ipv4/xfrm4_state.c | 1 - net/xfrm/xfrm_output.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2577666c34c8..397007324abd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1580,7 +1580,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) return xfrm_input(skb, nexthdr, spi, 0); } -int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 89ba7c87de5d..21c8fa0a31ed 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -14,46 +14,6 @@ #include #include -static int xfrm4_tunnel_check_size(struct sk_buff *skb) -{ - int mtu, ret = 0; - - if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) - goto out; - - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) - goto out; - - mtu = dst_mtu(skb_dst(skb)); - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && - !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { - skb->protocol = htons(ETH_P_IP); - - if (skb->sk) - xfrm_local_error(skb, mtu); - else - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_FRAG_NEEDED, htonl(mtu)); - ret = -EMSGSIZE; - } -out: - return ret; -} - -int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm4_tunnel_check_size(skb); - if (err) - return err; - - XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; - - return xfrm4_extract_header(skb); -} - int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 
f8ed3c3bb928..d7c200779e4f 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -37,7 +37,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .output = xfrm4_output, .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, - .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 2fd3d990d992..a7b3af7f7a1e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -609,6 +610,47 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output); +static int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) + goto out; + + mtu = dst_mtu(skb_dst(skb)); + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { + skb->protocol = htons(ETH_P_IP); + + if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm4_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; + + xfrm4_extract_header(skb); + return 0; +} + static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { const struct xfrm_state_afinfo *afinfo; @@ -624,6 +666,10 @@ static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) if (inner_mode == NULL) return -EAFNOSUPPORT; + switch (inner_mode->family) { + case AF_INET: + return xfrm4_extract_output(x, skb); + } rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) -- cgit v1.2.3 From a269fbfc4e9ffe48c1f8142e60a49b6f2e588c58 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:04 +0200 Subject: xfrm: state: remove extract_input indirection from xfrm_state_afinfo In order to keep CONFIG_IPV6=m working, xfrm6_extract_header needs to be duplicated. It will be removed again in a followup change when the remaining caller is moved to net/xfrm as well. 
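A minimal, self-contained C sketch of the transformation applied throughout this series; the names are stand-ins rather than kernel definitions, but the shape matches: an indirect call through a per-family ops table becomes a direct switch on the address family, which drops an indirect branch and lets the compiler inline the helpers:

#include <stdio.h>

enum { MY_AF_INET = 2, MY_AF_INET6 = 10 };      /* stand-in constants */

static void extract_header_v4(void) { printf("v4 header\n"); }
static void extract_header_v6(void) { printf("v6 header\n"); }

/* Before: the caller looked up an afinfo ops table under RCU and went
 * through a function pointer. After: a direct, inlinable switch. */
static int extract_input(int family)
{
        switch (family) {
        case MY_AF_INET:
                extract_header_v4();
                return 0;
        case MY_AF_INET6:
                extract_header_v6();
                return 0;
        default:
                return -1;      /* plays the role of -EAFNOSUPPORT */
        }
}

int main(void)
{
        return extract_input(MY_AF_INET) || extract_input(MY_AF_INET6);
}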
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- net/ipv4/xfrm4_input.c | 5 ----- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_input.c | 5 ----- net/ipv6/xfrm6_output.c | 17 ++++++++++++++++- net/ipv6/xfrm6_state.c | 24 ------------------------ net/xfrm/xfrm_inout.h | 18 ++++++++++++++++++ net/xfrm/xfrm_input.c | 21 +++++++++++---------- 8 files changed, 45 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 397007324abd..a21c1dea5340 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -362,8 +362,6 @@ struct xfrm_state_afinfo { int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sock *sk, struct sk_buff *skb); - int (*extract_input)(struct xfrm_state *x, - struct sk_buff *skb); int (*extract_output)(struct xfrm_state *x, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, @@ -1587,7 +1585,6 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char prot int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); -int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index f8de2482a529..ad2afeef4f10 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -18,11 +18,6 @@ #include #include -int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) -{ - return xfrm4_extract_header(skb); -} - static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index d7c200779e4f..521fc1bc069c 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -36,7 +36,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .proto = IPPROTO_IPIP, .output = xfrm4_output, .output_finish = xfrm4_output_finish, - .extract_input = xfrm4_extract_input, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 56f52353b324..04cbeefd8982 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -17,11 +17,6 @@ #include #include -int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) -{ - return xfrm6_extract_header(skb); -} - int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) { diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index fbe51d40bd7e..855078a43fc7 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -94,6 +94,20 @@ out: return ret; } +static void __xfrm6_extract_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +} + int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { int err; @@ -104,7 +118,8 @@ int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) 
XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; - return xfrm6_extract_header(skb); + __xfrm6_extract_header(skb); + return 0; } int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 78daadecbdef..8fbf5a68ee6e 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -13,36 +13,12 @@ */ #include -#include -#include -#include -#include -#include -#include -#include - -int xfrm6_extract_header(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = 0; - XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); - XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); - XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; - XFRM_MODE_SKB_CB(skb)->optlen = 0; - memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); - - return 0; -} static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, .output = xfrm6_output, .output_finish = xfrm6_output_finish, - .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, diff --git a/net/xfrm/xfrm_inout.h b/net/xfrm/xfrm_inout.h index c7b0318938e2..e24abac92dc2 100644 --- a/net/xfrm/xfrm_inout.h +++ b/net/xfrm/xfrm_inout.h @@ -6,6 +6,24 @@ #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 +static inline void xfrm6_extract_header(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +#else + WARN_ON_ONCE(1); +#endif +} + static inline void xfrm6_beet_make_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index aa35f23c4912..6db266a0cb2d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -353,17 +353,18 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { const struct xfrm_mode *inner_mode = &x->inner_mode; - const struct xfrm_state_afinfo *afinfo; - int err = -EAFNOSUPPORT; - - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - err = afinfo->extract_input(x, skb); - rcu_read_unlock(); - if (err) - return err; + switch (x->outer_mode.family) { + case AF_INET: + xfrm4_extract_header(skb); + break; + case AF_INET6: + xfrm6_extract_header(skb); + break; + default: + WARN_ON_ONCE(1); + return -EAFNOSUPPORT; + } if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); -- cgit v1.2.3 From 171916cbd53dec5c7b05efb56a201671d92effc1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:05 +0200 Subject: xfrm: move xfrm4_extract_header to common helper The function only initializes the XFRM CB in the skb. After previous patch xfrm4_extract_header is only called from net/xfrm/xfrm_{input,output}.c. 
Because of IPV6=m linker errors, the ipv6 equivalent (xfrm6_extract_header) was already placed in xfrm_inout.h, since we cannot call functions residing in a module from the core. So do the same for the ipv4 helper and place it next to the ipv6 one. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/xfrm4_state.c | 21 --------------------- net/xfrm/xfrm_inout.h | 14 ++++++++++++++ 3 files changed, 14 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a21c1dea5340..8b956528b6e6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1562,7 +1562,6 @@ int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); #endif void xfrm_local_error(struct sk_buff *skb, int mtu); -int xfrm4_extract_header(struct sk_buff *skb); int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 521fc1bc069c..b23a1711297b 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -8,28 +8,7 @@ * */ -#include #include -#include -#include -#include -#include - -int xfrm4_extract_header(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = iph->id; - XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; - XFRM_MODE_SKB_CB(skb)->tos = iph->tos; - XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; - XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); - memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); - - return 0; -} static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, diff --git a/net/xfrm/xfrm_inout.h b/net/xfrm/xfrm_inout.h index e24abac92dc2..efc5e6b2e87b 100644 --- a/net/xfrm/xfrm_inout.h +++ b/net/xfrm/xfrm_inout.h @@ -6,6 +6,20 @@ #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 +static inline void xfrm4_extract_header(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = iph->id; + XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; + XFRM_MODE_SKB_CB(skb)->tos = iph->tos; + XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); + memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +} + static inline void xfrm6_extract_header(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From 3e50ddd8b8d5067796fc87cbbb25c71451ccb385 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:06 +0200 Subject: xfrm: expose local_rxpmtu via ipv6_stubs We cannot call this function from the core kernel unless we force CONFIG_IPV6=y. Therefore expose this via ipv6_stubs so we can call it from net/xfrm in the followup patch. Since the call is expected to be unlikely, no extra code for the IPV6=y case is added and we will always eat the indirection cost.
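For readers unfamiliar with the stub mechanism, a hedged, self-contained C sketch with hypothetical names: the core holds only a pointer to an ops table, the module fills it in at load time, so the core never references module symbols directly, and every call pays the small indirection cost mentioned above:

#include <stdio.h>

struct ipv6_stub_ops {
        void (*local_rxpmtu)(unsigned int mtu);
};

/* Core side: just a pointer, no link-time reference to module code. */
static const struct ipv6_stub_ops *ipv6_stub;

/* "Module" side: the real implementation, registered when it loads. */
static void real_local_rxpmtu(unsigned int mtu)
{
        printf("rxpmtu: %u\n", mtu);
}

static const struct ipv6_stub_ops impl = {
        .local_rxpmtu = real_local_rxpmtu,
};

static void module_load(void)
{
        ipv6_stub = &impl;
}

int main(void)
{
        module_load();
        if (ipv6_stub && ipv6_stub->local_rxpmtu)
                ipv6_stub->local_rxpmtu(1280);  /* always an indirect call */
        return 0;
}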
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/ipv6_stubs.h | 1 + include/net/xfrm.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/xfrm6_output.c | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 1e9e0cf7dc75..d8ab3872aa2a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -57,6 +57,7 @@ struct ipv6_stub { const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); #if IS_ENABLED(CONFIG_XFRM) + void (*xfrm6_local_rxpmtu)(struct sk_buff *skb, u32 mtu); int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8b956528b6e6..10295ab4cdfb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1608,6 +1608,7 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); #ifdef CONFIG_XFRM +void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index cbbb00bad20e..aa4882929fd0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -963,6 +963,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, #if IS_ENABLED(CONFIG_XFRM) + .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu, .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 855078a43fc7..23e2b52cfba6 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -40,7 +40,7 @@ static int xfrm6_local_dontfrag(struct sk_buff *skb) return 0; } -static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) +void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; struct sock *sk = skb->sk; -- cgit v1.2.3 From ede64dd2bfe2710549f1922a214959d966baaac3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:07 +0200 Subject: xfrm: place xfrm6_local_dontfrag in xfrm.h so next patch can re-use it from net/xfrm/xfrm_output.c without causing a linker error when IPV6 is a module. 
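A hedged sketch of why moving the helper into a header works: a static inline is compiled into every translation unit that uses it, so nothing has to be exported from the ipv6 module and no unresolved symbol can appear. Simplified userspace C with stand-in constants and types:

#include <stdbool.h>
#include <stdio.h>

#define MY_AF_INET6    10       /* stand-ins for the kernel constants */
#define MY_IPPROTO_UDP 17
#define MY_IPPROTO_RAW 255

struct sock {
        int sk_family;
        int sk_protocol;
        bool dontfrag;
};

/* Lives in a shared header: every caller gets its own inlined copy,
 * so the helper never becomes an undefined symbol when ipv6 is modular. */
static inline bool my_local_dontfrag(const struct sock *sk)
{
        if (!sk || sk->sk_family != MY_AF_INET6)
                return false;
        if (sk->sk_protocol == MY_IPPROTO_UDP ||
            sk->sk_protocol == MY_IPPROTO_RAW)
                return sk->dontfrag;
        return false;
}

int main(void)
{
        struct sock sk = { MY_AF_INET6, MY_IPPROTO_UDP, true };

        printf("%d\n", my_local_dontfrag(&sk));        /* prints 1 */
        return 0;
}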
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 16 ++++++++++++++++ net/ipv6/xfrm6_output.c | 21 ++------------------- 2 files changed, 18 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 10295ab4cdfb..8f7fb033d557 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1993,4 +1993,20 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } + +#if IS_ENABLED(CONFIG_IPV6) +static inline bool xfrm6_local_dontfrag(const struct sock *sk) +{ + int proto; + + if (!sk || sk->sk_family != AF_INET6) + return false; + + proto = sk->sk_protocol; + if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) + return inet6_sk(sk)->dontfrag; + + return false; +} +#endif #endif /* _NET_XFRM_H */ diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 23e2b52cfba6..be64f280510c 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -23,23 +23,6 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, } EXPORT_SYMBOL(xfrm6_find_1stfragopt); -static int xfrm6_local_dontfrag(struct sk_buff *skb) -{ - int proto; - struct sock *sk = skb->sk; - - if (sk) { - if (sk->sk_family != AF_INET6) - return 0; - - proto = sk->sk_protocol; - if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) - return inet6_sk(sk)->dontfrag; - } - - return 0; -} - void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; @@ -82,7 +65,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); - if (xfrm6_local_dontfrag(skb)) + if (xfrm6_local_dontfrag(skb->sk)) xfrm6_local_rxpmtu(skb, mtu); else if (skb->sk) xfrm_local_error(skb, mtu); @@ -181,7 +164,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb)) { + if (toobig && xfrm6_local_dontfrag(skb->sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; -- cgit v1.2.3 From f3075f48ddb2c4d076aeda36fa0939163e4b2816 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:08 +0200 Subject: xfrm: remove extract_output indirection from xfrm_state_afinfo Move this to xfrm_output.c. This avoids the state->extract_output indirection. This patch also removes the duplicated __xfrm6_extract_header helper added in an earlier patch, we can now use the one from xfrm_inout.h . 
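A small hedged sketch of the compile-time guard pattern used for the ipv6 helper, with a stand-in macro playing the role of IS_ENABLED(CONFIG_IPV6): when the ipv6 side is configured out, the function collapses to a warning plus an error return instead of an unresolved symbol:

#include <stdio.h>

#define MY_CONFIG_IPV6 1        /* flip to 0 to exercise the fallback */

static int extract_output_v6(void)
{
#if MY_CONFIG_IPV6
        printf("v6 output header extracted\n");
        return 0;
#else
        /* stands in for WARN_ON_ONCE(1) and -EAFNOSUPPORT */
        fprintf(stderr, "warn: ipv6 not built in\n");
        return -97;
#endif
}

int main(void)
{
        return extract_output_v6();
}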
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- net/ipv6/xfrm6_output.c | 58 ------------------------------------------ net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_output.c | 67 +++++++++++++++++++++++++++++++++++++++++++------ 4 files changed, 59 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8f7fb033d557..db814a7e042f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -362,8 +362,6 @@ struct xfrm_state_afinfo { int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sock *sk, struct sk_buff *skb); - int (*extract_output)(struct xfrm_state *x, - struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); @@ -1601,7 +1599,6 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family); __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); -int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index be64f280510c..b7d65b344679 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -47,64 +47,6 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu) ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } -static int xfrm6_tunnel_check_size(struct sk_buff *skb) -{ - int mtu, ret = 0; - struct dst_entry *dst = skb_dst(skb); - - if (skb->ignore_df) - goto out; - - mtu = dst_mtu(dst); - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && - !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { - skb->dev = dst->dev; - skb->protocol = htons(ETH_P_IPV6); - - if (xfrm6_local_dontfrag(skb->sk)) - xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) - xfrm_local_error(skb, mtu); - else - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - ret = -EMSGSIZE; - } -out: - return ret; -} - -static void __xfrm6_extract_header(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = 0; - XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); - XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); - XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; - XFRM_MODE_SKB_CB(skb)->optlen = 0; - memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); -} - -int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm6_tunnel_check_size(skb); - if (err) - return err; - - XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; - - __xfrm6_extract_header(skb); - return 0; -} - int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 8fbf5a68ee6e..15247f2f78e1 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -19,7 +19,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .proto = IPPROTO_IPV6, .output = xfrm6_output, .output_finish = xfrm6_output_finish, - .extract_output = 
xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a7b3af7f7a1e..3a646df1318d 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -17,6 +17,11 @@ #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif + #include "xfrm_inout.h" static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -651,11 +656,60 @@ static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } +#if IS_ENABLED(CONFIG_IPV6) +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb_dst(skb); + + if (skb->ignore_df) + goto out; + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + + if (xfrm6_local_dontfrag(skb->sk)) + ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); + else if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ret = -EMSGSIZE; + } +out: + return ret; +} +#endif + +static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + int err; + + err = xfrm6_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; + + xfrm6_extract_header(skb); + return 0; +#else + WARN_ON_ONCE(1); + return -EAFNOSUPPORT; +#endif +} + static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { - const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; - int err = -EAFNOSUPPORT; if (x->sel.family == AF_UNSPEC) inner_mode = xfrm_ip2inner_mode(x, @@ -669,14 +723,11 @@ static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) switch (inner_mode->family) { case AF_INET: return xfrm4_extract_output(x, skb); + case AF_INET6: + return xfrm6_extract_output(x, skb); } - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (likely(afinfo)) - err = afinfo->extract_output(x, skb); - rcu_read_unlock(); - return err; + return -EAFNOSUPPORT; } void xfrm_local_error(struct sk_buff *skb, int mtu) -- cgit v1.2.3 From 2ab6096db2f16b3a6adbad252f1be171e649028d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 May 2020 10:06:09 +0200 Subject: xfrm: remove output_finish indirection from xfrm_state_afinfo There are only two implementations, one for ipv4 and one for ipv6. Both are almost identical: they clear skb->cb[], set the TRANSFORMED flag in IP(6)CB and then call the common xfrm_output() function. By placing the IPCB handling into the common function, we avoid the need for the output_finish indirection as the output functions can simply use xfrm_output().
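A hedged userspace C sketch of the consolidation, using hypothetical types: the two wrappers differed only in which per-family control block view of skb->cb[] they reset, so the common output path can do the reset itself behind a family switch and the wrappers disappear:

#include <stdio.h>
#include <string.h>

enum { MY_AF_INET = 2, MY_AF_INET6 = 10 };

struct my_skb {
        int family;
        unsigned char cb[48];   /* stand-in for skb->cb[] */
};

/* Formerly duplicated in per-family output_finish wrappers; now done
 * once in the shared output function. */
static int common_output(struct my_skb *skb)
{
        switch (skb->family) {
        case MY_AF_INET:
                memset(skb->cb, 0, sizeof(skb->cb));    /* IPCB reset */
                break;
        case MY_AF_INET6:
                memset(skb->cb, 0, sizeof(skb->cb));    /* IP6CB reset */
                break;
        }
        printf("transform and transmit (family %d)\n", skb->family);
        return 0;
}

int main(void)
{
        struct my_skb skb = { MY_AF_INET, { 0 } };

        return common_output(&skb);
}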
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/xfrm4_output.c | 23 +---------------------- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_output.c | 34 ++-------------------------------- net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_output.c | 16 ++++++++++++++++ 6 files changed, 19 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index db814a7e042f..094fe682f5d7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -361,7 +361,6 @@ struct xfrm_state_afinfo { const struct xfrm_type *type_dstopts; int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); - int (*output_finish)(struct sock *sk, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 21c8fa0a31ed..502eb189d852 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -14,22 +14,9 @@ #include #include -int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) -{ - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - -#ifdef CONFIG_NETFILTER - IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; -#endif - - return xfrm_output(sk, skb); -} - static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; - const struct xfrm_state_afinfo *afinfo; - int ret = -EAFNOSUPPORT; #ifdef CONFIG_NETFILTER if (!x) { @@ -38,15 +25,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) } #endif - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - ret = afinfo->output_finish(sk, skb); - else - kfree_skb(skb); - rcu_read_unlock(); - - return ret; + return xfrm_output(sk, skb); } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index b23a1711297b..87d4db591488 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -14,7 +14,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, .proto = IPPROTO_IPIP, .output = xfrm4_output, - .output_finish = xfrm4_output_finish, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index b7d65b344679..8b84d534b19d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -47,39 +47,9 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu) ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } -int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) -{ - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - -#ifdef CONFIG_NETFILTER - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif - - return xfrm_output(sk, skb); -} - -static int __xfrm6_output_state_finish(struct xfrm_state *x, struct sock *sk, - struct sk_buff *skb) -{ - const struct xfrm_state_afinfo *afinfo; - int ret = -EAFNOSUPPORT; - - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - ret = afinfo->output_finish(sk, skb); - else - kfree_skb(skb); - rcu_read_unlock(); - - return ret; -} - static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct xfrm_state *x = skb_dst(skb)->xfrm; - - return __xfrm6_output_state_finish(x, sk, skb); + return xfrm_output(sk, skb); } static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -121,7 
+91,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) __xfrm6_output_finish); skip_frag: - return __xfrm6_output_state_finish(x, sk, skb); + return xfrm_output(sk, skb); } int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 15247f2f78e1..6610b2198fa9 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -18,7 +18,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, .output = xfrm6_output, - .output_finish = xfrm6_output_finish, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 3a646df1318d..9c43b8dd80fb 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -571,6 +571,22 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) struct xfrm_state *x = skb_dst(skb)->xfrm; int err; + switch (x->outer_mode.family) { + case AF_INET: + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +#ifdef CONFIG_NETFILTER + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif + break; + case AF_INET6: + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + break; + } + secpath_reset(skb); if (xfrm_dev_offload_ok(skb, x)) { -- cgit v1.2.3 From 8741e18419bf4b9e3b02c0ed01ea13f1aa497fa7 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 6 May 2020 14:16:16 +0800 Subject: net: bridge: return false in br_mrp_enabled() Fix the following coccicheck warning: net/bridge/br_private.h:1334:8-9: WARNING: return of 0/1 in function 'br_mrp_enabled' with return type bool Fixes: 6536993371fab ("bridge: mrp: Integrate MRP into the bridge") Signed-off-by: Jason Yan Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index c35647cb138a..78d3a951180d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1331,7 +1331,7 @@ static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) static inline bool br_mrp_enabled(struct net_bridge *br) { - return 0; + return false; } static inline void br_mrp_port_del(struct net_bridge *br, -- cgit v1.2.3 From 969c54646af0d7d94a5f0f37adbbfe024e85466e Mon Sep 17 00:00:00 2001 From: Fernando Gont Date: Fri, 1 May 2020 00:51:47 -0300 Subject: ipv6: Implement draft-ietf-6man-rfc4941bis Implement the upcoming rev of RFC4941 (IPv6 temporary addresses): https://tools.ietf.org/html/draft-ietf-6man-rfc4941bis-09 * Reduces the default Valid Lifetime to 2 days The number of extra addresses employed when Valid Lifetime was 7 days exacerbated the stress caused on network elements/devices. Additionally, the motivation for temporary addresses is indeed privacy and reduced exposure. With a default Valid Lifetime of 7 days, an address that becomes revealed by active communication is reachable and exposed for one whole week. The only use case for a Valid Lifetime of 7 days could be some application that is expecting to have long-lived connections. But if you want to have long-lived connections, you shouldn't be using a temporary address in the first place. Additionally, in the era of mobile devices, general applications should nevertheless be prepared and robust to address changes (e.g. nodes swap wifi <-> 4G, etc.)
* Employs different IIDs for different prefixes To avoid network activity correlation among addresses configured for different prefixes * Uses a simpler algorithm for IID generation No need to store "history" anywhere Signed-off-by: Fernando Gont Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 2 +- include/net/if_inet6.h | 1 - net/ipv6/addrconf.c | 91 +++++++++++++++------------------- 3 files changed, 40 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 50b440d29a13..b72f89d5694c 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2065,7 +2065,7 @@ use_tempaddr - INTEGER temp_valid_lft - INTEGER valid lifetime (in seconds) for temporary addresses. - Default: 604800 (7 days) + Default: 172800 (2 days) temp_prefered_lft - INTEGER Preferred lifetime (in seconds) for temporary addresses. diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index a01981d7108f..212eb278bda6 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -190,7 +190,6 @@ struct inet6_dev { int dead; u32 desync_factor; - u8 rndid[8]; struct list_head tempaddr_list; struct in6_addr token; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 26e666fe9a0e..fd885f06c4ed 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -135,8 +135,7 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) } #endif -static void ipv6_regen_rndid(struct inet6_dev *idev); -static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_gen_rnd_iid(struct in6_addr *addr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(const struct inet6_dev *idev); @@ -432,8 +431,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_SIT || dev->type == ARPHRD_NONE) { ndev->cnf.use_tempaddr = -1; - } else - ipv6_regen_rndid(ndev); + } ndev->token = in6addr_any; @@ -1306,29 +1304,21 @@ out: in6_ifa_put(ifp); } -static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, - struct inet6_ifaddr *ift, - bool block) +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) { struct inet6_dev *idev = ifp->idev; - struct in6_addr addr, *tmpaddr; unsigned long tmp_tstamp, age; unsigned long regen_advance; - struct ifa6_config cfg; - int ret = 0; unsigned long now = jiffies; - long max_desync_factor; s32 cnf_temp_preferred_lft; + struct inet6_ifaddr *ift; + struct ifa6_config cfg; + long max_desync_factor; + struct in6_addr addr; + int ret = 0; write_lock_bh(&idev->lock); - if (ift) { - spin_lock_bh(&ift->lock); - memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); - spin_unlock_bh(&ift->lock); - tmpaddr = &addr; - } else { - tmpaddr = NULL; - } + retry: in6_dev_hold(idev); if (idev->cnf.use_tempaddr <= 0) { @@ -1351,8 +1341,8 @@ retry: } in6_ifa_hold(ifp); memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); - ipv6_try_regen_rndid(idev, tmpaddr); - memcpy(&addr.s6_addr[8], idev->rndid, 8); + ipv6_gen_rnd_iid(&addr); + age = (now - ifp->tstamp) / HZ; regen_advance = idev->cnf.regen_max_retry * @@ -1417,7 +1407,6 @@ retry: in6_ifa_put(ifp); in6_dev_put(idev); pr_info("%s: retry temporary address regeneration\n", __func__); - tmpaddr = &addr; write_lock_bh(&idev->lock); goto retry; } @@ -2032,7 +2021,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) if (ifpub) { in6_ifa_hold(ifpub); 
spin_unlock_bh(&ifp->lock); - ipv6_create_tempaddr(ifpub, ifp, true); + ipv6_create_tempaddr(ifpub, true); in6_ifa_put(ifpub); } else { spin_unlock_bh(&ifp->lock); } @@ -2329,40 +2318,38 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) return err; } -/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ -static void ipv6_regen_rndid(struct inet6_dev *idev) +/* Generation of a randomized Interface Identifier + * draft-ietf-6man-rfc4941bis, Section 3.3.1 + */ + +static void ipv6_gen_rnd_iid(struct in6_addr *addr) { regen: - get_random_bytes(idev->rndid, sizeof(idev->rndid)); - idev->rndid[0] &= ~0x02; + get_random_bytes(&addr->s6_addr[8], 8); - /* - * : - * check if generated address is not inappropriate + /* , Section 3.3.1: + * check if generated address is not inappropriate: * - * - Reserved subnet anycast (RFC 2526) - * 11111101 11....11 1xxxxxxx - * - ISATAP (RFC4214) 6.1 - * 00-00-5E-FE-xx-xx-xx-xx - * - value 0 - * - XXX: already assigned to an address on the device + * - Reserved IPv6 Interface Identifers + * - XXX: already assigned to an address on the device */ - if (idev->rndid[0] == 0xfd && - (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && - (idev->rndid[7]&0x80)) + + /* Subnet-router anycast: 0000:0000:0000:0000 */ + if (!(addr->s6_addr32[2] | addr->s6_addr32[3])) goto regen; - if ((idev->rndid[0]|idev->rndid[1]) == 0) { - if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) - goto regen; - if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) - goto regen; - } -} -static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) -{ - if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) - ipv6_regen_rndid(idev); + /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212 + * Proxy Mobile IPv6: 0200:5EFF:FE00:5213 + * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF + */ + if (ntohl(addr->s6_addr32[2]) == 0x02005eff && + (ntohl(addr->s6_addr32[3]) & 0Xff000000) == 0xfe000000) + goto regen; + + /* Reserved subnet anycast addresses */ + if (ntohl(addr->s6_addr32[2]) == 0xfdffffff && + ntohl(addr->s6_addr32[3]) >= 0Xffffff80) + goto regen; } /* @@ -2544,7 +2531,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, * no temporary address currently exists. */ read_unlock_bh(&idev->lock); - ipv6_create_tempaddr(ifp, NULL, false); + ipv6_create_tempaddr(ifp, false); } else { read_unlock_bh(&idev->lock); } @@ -4531,7 +4518,7 @@ restart: ifpub->regen_count = 0; spin_unlock(&ifpub->lock); rcu_read_unlock_bh(); - ipv6_create_tempaddr(ifpub, ifp, true); + ipv6_create_tempaddr(ifpub, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); rcu_read_lock_bh(); -- cgit v1.2.3 From 8dc242ad661c2694a582541c2264ffc0e7c4d27d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 May 2020 11:27:49 -0700 Subject: tcp: refine tcp_pacing_delay() for very low pacing rates With the addition of the horizon feature to sch_fq, we noticed some suboptimal behavior of extremely low pacing rate TCP flows, especially when TCP is not aware of a drop happening in lower stacks. Back in commit 3f80e08f40cd ("tcp: add tcp_reset_xmit_timer() helper"), tcp_pacing_delay() was added to estimate an extra delay to add to standard rto timers.
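In condensed form, the estimate changes from a per-skb view to a flow-level view (a sketch; the exact hunks are below):

	/* Before: delay based on a given skb's EDT, if one was passed */
	s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns;
	pacing_delay -= tcp_sk(sk)->tcp_clock_cache;

	/* After: always based on when the next packet may be sent */
	s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
	return delay > 0 ? nsecs_to_jiffies(delay) : 0;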
This patch removes the skb argument from this helper and tcp_reset_xmit_timer() because it makes more sense to simply consider the time at which next packet is allowed to be sent, instead of the time of whatever packet has been sent. This avoids arming RTO timer too soon and removes spurious horizon drops. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 21 ++++++++------------- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_output.c | 8 +++----- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1beed50522b1..43b87a8d4790 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1289,26 +1289,22 @@ static inline bool tcp_needs_internal_pacing(const struct sock *sk) return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } -/* Return in jiffies the delay before one skb is sent. - * If @skb is NULL, we look at EDT for next packet being sent on the socket. +/* Estimates in how many jiffies next packet for this flow can be sent. + * Scheduling a retransmit timer too early would be silly. */ -static inline unsigned long tcp_pacing_delay(const struct sock *sk, - const struct sk_buff *skb) +static inline unsigned long tcp_pacing_delay(const struct sock *sk) { - s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns; + s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; - pacing_delay -= tcp_sk(sk)->tcp_clock_cache; - - return pacing_delay > 0 ? nsecs_to_jiffies(pacing_delay) : 0; + return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, - const unsigned long max_when, - const struct sk_buff *skb) + const unsigned long max_when) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk, skb), + inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } @@ -1336,8 +1332,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX, - NULL); + tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d68128a672ab..7d205b2a733c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3014,7 +3014,7 @@ void tcp_rearm_rto(struct sock *sk) rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX, tcp_rtx_queue_head(sk)); + TCP_RTO_MAX); } } @@ -3291,7 +3291,7 @@ static void tcp_ack_probe(struct sock *sk) unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - when, TCP_RTO_MAX, NULL); + when, TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c414aeb1efa9..32c9db902f18 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2593,8 +2593,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, - TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } @@ -3174,8 +3173,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, - TCP_RTO_MAX, - skb); + 
TCP_RTO_MAX); } } @@ -3907,7 +3905,7 @@ void tcp_send_probe0(struct sock *sk) */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) -- cgit v1.2.3 From 916e6d1a5ef17a6b3bffad0f086f173cde4240d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 May 2020 11:27:50 -0700 Subject: tcp: defer xmit timer reset in tcp_xmit_retransmit_queue() As hinted in the prior change ("tcp: refine tcp_pacing_delay() for very low pacing rates"), it is probably best to arm the xmit timer only when all the packets have been scheduled, rather than when the head of the rtx queue has been re-sent. This does matter for flows having extremely low pacing rates, since their tp->tcp_wstamp_ns could be far in the future. Note that the regular xmit path has a stronger limit in tcp_small_queue_check(), meaning it is less likely to go beyond the pacing horizon. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32c9db902f18..a50e1990a845 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3112,6 +3112,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); + bool rearm_timer = false; u32 max_segs; int mib_idx; @@ -3134,7 +3135,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) segs = tp->snd_cwnd - tcp_packets_in_flight(tp); if (segs <= 0) - return; + break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets @@ -3159,10 +3160,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; if (tcp_small_queue_check(sk, skb, 1)) - return; + break; if (tcp_retransmit_skb(sk, skb, segs)) - return; + break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); @@ -3171,10 +3172,13 @@ if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + rearm_timer = true; + } + if (rearm_timer) + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite -- cgit v1.2.3 From bdbdac7649fac05f88c9f7ab18121a17fb591687 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 5 May 2020 08:35:05 +0200 Subject: ethtool: provide UAPI for PHY master/slave configuration. This UAPI is needed for BroadR-Reach 100BASE-T1 devices. Due to lack of auto-negotiation support, we needed to be able to configure the MASTER-SLAVE role of the port manually or from an application in user space. The same UAPI can be used for 1000BASE-T or MultiGBASE-T devices to force MASTER or SLAVE role.
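In the generic PHY driver, the new UAPI values map onto the MASTER-SLAVE control bits of register 9 (MII_CTRL1000); a condensed sketch of genphy_setup_master_slave() from the diff below:

	u16 ctl = 0;

	switch (phydev->master_slave_set) {
	case MASTER_SLAVE_CFG_MASTER_PREFERRED:
		ctl |= CTL1000_PREFER_MASTER;	/* prefer to operate as master */
		break;
	case MASTER_SLAVE_CFG_MASTER_FORCE:
		ctl |= CTL1000_AS_MASTER;	/* manual master */
		/* fallthrough */
	case MASTER_SLAVE_CFG_SLAVE_FORCE:
		ctl |= CTL1000_ENABLE_MASTER;	/* manual configuration enable */
		break;
	}

	phy_modify_changed(phydev, MII_CTRL1000,
			   CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER |
			   CTL1000_PREFER_MASTER, ctl);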
See IEEE 802.3-2018: 22.2.4.3.7 MASTER-SLAVE control register (Register 9) 22.2.4.3.8 MASTER-SLAVE status register (Register 10) 40.5.2 MASTER-SLAVE configuration resolution 45.2.1.185.1 MASTER-SLAVE config value (1.2100.14) 45.2.7.10 MultiGBASE-T AN control 1 register (Register 7.32) The MASTER-SLAVE role affects the clock configuration: ------------------------------------------------------------------------------- When the PHY is configured as MASTER, the PMA Transmit function shall source TX_TCLK from a local clock source. When configured as SLAVE, the PMA Transmit function shall source TX_TCLK from the clock recovered from data stream provided by MASTER. iMX6Q KSZ9031 XXX ------\ /-----------\ /------------\ | | | | | MAC |<----RGMII----->| PHY Slave |<------>| PHY Master | |<--- 125 MHz ---+-<------/ | | \ | ------/ \-----------/ \------------/ ^ \-TX_TCLK ------------------------------------------------------------------------------- Since some clock or link related issues are only reproducible in a specific MASTER-SLAVE-role, MAC and PHY configuration, it is beneficial to provide generic (not 100BASE-T1 specific) interface to the user space for configuration flexibility and trouble shooting. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 35 ++++++----- drivers/net/phy/phy.c | 4 +- drivers/net/phy/phy_device.c | 94 ++++++++++++++++++++++++++++ include/linux/phy.h | 3 + include/uapi/linux/ethtool.h | 16 ++++- include/uapi/linux/ethtool_netlink.h | 2 + include/uapi/linux/mii.h | 2 + net/ethtool/ioctl.c | 6 ++ net/ethtool/linkmodes.c | 53 ++++++++++++++++ 9 files changed, 197 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 567326491f80..8f5cefc539cf 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -392,14 +392,16 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE`` u8 Master/slave port state + ========================================== ====== ========================== For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask represents supported modes. 
``ETHTOOL_A_LINKMODES_PEER`` in the reply is a bit @@ -414,14 +416,15 @@ LINKMODES_SET Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested request header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested request header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ========================================== ====== ========================== ``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If autonegotiation is on (either set now or kept from before), advertised modes diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 72c69a9c8a98..8c22d02b4218 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -295,7 +295,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->advertising, autoneg == AUTONEG_ENABLE); phydev->duplex = duplex; - + phydev->master_slave_set = cmd->base.master_slave_cfg; phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ @@ -314,6 +314,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.speed = phydev->speed; cmd->base.duplex = phydev->duplex; + cmd->base.master_slave_cfg = phydev->master_slave_get; + cmd->base.master_slave_state = phydev->master_slave_state; if (phydev->interface == PHY_INTERFACE_MODE_MOCA) cmd->base.port = PORT_BNC; else diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b1c5e4503bc4..83fc8e1b5793 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1913,6 +1913,90 @@ int genphy_setup_forced(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_setup_forced); +static int genphy_setup_master_slave(struct phy_device *phydev) +{ + u16 ctl = 0; + + if (!phydev->is_gigabit_capable) + return 0; + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + ctl |= CTL1000_PREFER_MASTER; + break; + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + break; + case MASTER_SLAVE_CFG_MASTER_FORCE: + ctl |= CTL1000_AS_MASTER; + /* fallthrough */ + case MASTER_SLAVE_CFG_SLAVE_FORCE: + ctl |= CTL1000_ENABLE_MASTER; + break; + case MASTER_SLAVE_CFG_UNKNOWN: + case MASTER_SLAVE_CFG_UNSUPPORTED: + return 0; + default: + phydev_warn(phydev, "Unsupported Master/Slave mode\n"); + return -EOPNOTSUPP; + } + + return phy_modify_changed(phydev, MII_CTRL1000, + (CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER | + CTL1000_PREFER_MASTER), ctl); +} + +static int genphy_read_master_slave(struct phy_device *phydev) +{ + int cfg, state; + u16 val; + + if (!phydev->is_gigabit_capable) { + phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + return 0; + } + + phydev->master_slave_get = MASTER_SLAVE_CFG_UNKNOWN; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; + + 
val = phy_read(phydev, MII_CTRL1000); + if (val < 0) + return val; + + if (val & CTL1000_ENABLE_MASTER) { + if (val & CTL1000_AS_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_FORCE; + else + cfg = MASTER_SLAVE_CFG_SLAVE_FORCE; + } else { + if (val & CTL1000_PREFER_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_PREFERRED; + else + cfg = MASTER_SLAVE_CFG_SLAVE_PREFERRED; + } + + val = phy_read(phydev, MII_STAT1000); + if (val < 0) + return val; + + if (val & LPA_1000MSFAIL) { + state = MASTER_SLAVE_STATE_ERR; + } else if (phydev->link) { + /* this bits are valid only for active link */ + if (val & LPA_1000MSRES) + state = MASTER_SLAVE_STATE_MASTER; + else + state = MASTER_SLAVE_STATE_SLAVE; + } else { + state = MASTER_SLAVE_STATE_UNKNOWN; + } + + phydev->master_slave_get = cfg; + phydev->master_slave_state = state; + + return 0; +} + /** * genphy_restart_aneg - Enable and Restart Autonegotiation * @phydev: target phy_device struct @@ -1971,6 +2055,12 @@ int __genphy_config_aneg(struct phy_device *phydev, bool changed) if (genphy_config_eee_advert(phydev)) changed = true; + err = genphy_setup_master_slave(phydev); + if (err < 0) + return err; + else if (err) + changed = true; + if (AUTONEG_ENABLE != phydev->autoneg) return genphy_setup_forced(phydev); @@ -2205,6 +2295,10 @@ int genphy_read_status(struct phy_device *phydev) phydev->pause = 0; phydev->asym_pause = 0; + err = genphy_read_master_slave(phydev); + if (err < 0) + return err; + err = genphy_read_lpa(phydev); if (err < 0) return err; diff --git a/include/linux/phy.h b/include/linux/phy.h index 1d36ac608159..a2b91b5f9d0a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -477,6 +477,9 @@ struct phy_device { int duplex; int pause; int asym_pause; + u8 master_slave_get; + u8 master_slave_set; + u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 92f737f10117..f4662b3a9e1e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1666,6 +1666,18 @@ static inline int ethtool_validate_duplex(__u8 duplex) return 0; } +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + /* Which connector port. 
*/ #define PORT_TP 0x00 #define PORT_AUI 0x01 @@ -1904,7 +1916,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; __u8 transceiver; - __u8 reserved1[3]; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 reserved1[1]; __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..bf1d310e20bc 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -216,6 +216,8 @@ enum { ETHTOOL_A_LINKMODES_PEER, /* bitset */ ETHTOOL_A_LINKMODES_SPEED, /* u32 */ ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 90f9b4e1ba27..39f7c44baf53 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -151,11 +151,13 @@ /* 1000BASE-T Control register */ #define ADVERTISE_1000FULL 0x0200 /* Advertise 1000BASE-T full duplex */ #define ADVERTISE_1000HALF 0x0100 /* Advertise 1000BASE-T half duplex */ +#define CTL1000_PREFER_MASTER 0x0400 /* prefer to operate as master */ #define CTL1000_AS_MASTER 0x0800 #define CTL1000_ENABLE_MASTER 0x1000 /* 1000BASE-T Status register */ #define LPA_1000MSFAIL 0x8000 /* Master/Slave resolution failure */ +#define LPA_1000MSRES 0x4000 /* Master/Slave resolution status */ #define LPA_1000LOCALRXOK 0x2000 /* Link partner local receiver status */ #define LPA_1000REMRXOK 0x1000 /* Link partner remote receiver status */ #define LPA_1000FULL 0x0800 /* Link partner 1000BASE-T full duplex */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 226d5ecdd567..52102ab1709b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -552,6 +552,8 @@ static int ethtool_get_link_ksettings(struct net_device *dev, link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; + link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; + link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; return store_link_ksettings_for_user(useraddr, &link_ksettings); } @@ -589,6 +591,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; + if (link_ksettings.base.master_slave_cfg || + link_ksettings.base.master_slave_state) + return -EINVAL; + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 452608c6d856..fd4f3e58c6f6 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,6 +27,8 @@ linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, @@ -63,6 +65,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; 
+ const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; @@ -86,6 +89,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, len += ret; } + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + return len; } @@ -122,6 +131,16 @@ static int linkmodes_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + lsettings->master_slave_cfg)) + return -EMSGSIZE; + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + lsettings->master_slave_state)) + return -EMSGSIZE; + return 0; } @@ -249,6 +268,8 @@ linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; /* Set advertised link modes to all supported modes matching requested speed @@ -287,14 +308,45 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static bool ethnl_validate_master_slave_cfg(u8 cfg) +{ + switch (cfg) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_SLAVE_FORCE: + return true; + } + + return false; +} + static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_duplex; + const struct nlattr *master_slave_cfg; int ret; + master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; + if (master_slave_cfg) { + u8 cfg = nla_get_u8(master_slave_cfg); + + if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave configuration not supported by device"); + return -EOPNOTSUPP; + } + + if (!ethnl_validate_master_slave_cfg(cfg)) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave value is invalid"); + return -EOPNOTSUPP; + } + } + *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; @@ -311,6 +363,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); + ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_duplex) && -- cgit v1.2.3 From 6e728f321393b1fce9e1c2c3e55f9f7c15991321 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 7 May 2020 18:23:05 +0530 Subject: net: qrtr: Add MHI transport layer MHI is the transport layer used for communicating with external modems. Hence, this commit adds MHI transport layer support to QRTR for transferring QMI messages over IPC Router. Reviewed-by: Bjorn Andersson Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S.
Miller --- net/qrtr/Kconfig | 7 +++ net/qrtr/Makefile | 2 + net/qrtr/mhi.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 net/qrtr/mhi.c (limited to 'net') diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index 63f89cc6e82c..8eb876471564 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -29,4 +29,11 @@ config QRTR_TUN implement endpoints of QRTR, for purpose of tunneling data to other hosts or testing purposes. +config QRTR_MHI + tristate "MHI IPC Router channels" + depends on MHI_BUS + help + Say Y here to support MHI based ipcrouter channels. MHI is the + transport used for communicating to external modems. + endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile index 32d4e923925d..1b1411d158a7 100644 --- a/net/qrtr/Makefile +++ b/net/qrtr/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o qrtr-smd-y := smd.o obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o qrtr-tun-y := tun.o +obj-$(CONFIG_QRTR_MHI) += qrtr-mhi.o +qrtr-mhi-y := mhi.o diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c new file mode 100644 index 000000000000..ff0c41467fc1 --- /dev/null +++ b/net/qrtr/mhi.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018-2020, The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include "qrtr.h" + +struct qrtr_mhi_dev { + struct qrtr_endpoint ep; + struct mhi_device *mhi_dev; + struct device *dev; +}; + +/* From MHI to QRTR */ +static void qcom_mhi_qrtr_dl_callback(struct mhi_device *mhi_dev, + struct mhi_result *mhi_res) +{ + struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); + int rc; + + if (!qdev || mhi_res->transaction_status) + return; + + rc = qrtr_endpoint_post(&qdev->ep, mhi_res->buf_addr, + mhi_res->bytes_xferd); + if (rc == -EINVAL) + dev_err(qdev->dev, "invalid ipcrouter packet\n"); +} + +/* From QRTR to MHI */ +static void qcom_mhi_qrtr_ul_callback(struct mhi_device *mhi_dev, + struct mhi_result *mhi_res) +{ + struct sk_buff *skb = mhi_res->buf_addr; + + if (skb->sk) + sock_put(skb->sk); + consume_skb(skb); +} + +/* Send data over MHI */ +static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) +{ + struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); + int rc; + + rc = skb_linearize(skb); + if (rc) + goto free_skb; + + rc = mhi_queue_skb(qdev->mhi_dev, DMA_TO_DEVICE, skb, skb->len, + MHI_EOT); + if (rc) + goto free_skb; + + if (skb->sk) + sock_hold(skb->sk); + + return rc; + +free_skb: + kfree_skb(skb); + + return rc; +} + +static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, + const struct mhi_device_id *id) +{ + struct qrtr_mhi_dev *qdev; + int rc; + + qdev = devm_kzalloc(&mhi_dev->dev, sizeof(*qdev), GFP_KERNEL); + if (!qdev) + return -ENOMEM; + + qdev->mhi_dev = mhi_dev; + qdev->dev = &mhi_dev->dev; + qdev->ep.xmit = qcom_mhi_qrtr_send; + + dev_set_drvdata(&mhi_dev->dev, qdev); + rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); + if (rc) + return rc; + + dev_dbg(qdev->dev, "Qualcomm MHI QRTR driver probed\n"); + + return 0; +} + +static void qcom_mhi_qrtr_remove(struct mhi_device *mhi_dev) +{ + struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); + + qrtr_endpoint_unregister(&qdev->ep); + dev_set_drvdata(&mhi_dev->dev, NULL); +} + +static const struct mhi_device_id qcom_mhi_qrtr_id_table[] = { + { .chan = "IPCR" }, + {} +}; +MODULE_DEVICE_TABLE(mhi, qcom_mhi_qrtr_id_table); + +static struct mhi_driver qcom_mhi_qrtr_driver = { + .probe = 
qcom_mhi_qrtr_probe, + .remove = qcom_mhi_qrtr_remove, + .dl_xfer_cb = qcom_mhi_qrtr_dl_callback, + .ul_xfer_cb = qcom_mhi_qrtr_ul_callback, + .id_table = qcom_mhi_qrtr_id_table, + .driver = { + .name = "qcom_mhi_qrtr", + }, +}; + +module_mhi_driver(qcom_mhi_qrtr_driver); + +MODULE_AUTHOR("Chris Lew "); +MODULE_AUTHOR("Manivannan Sadhasivam "); +MODULE_DESCRIPTION("Qualcomm IPC-Router MHI interface driver"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From e42671084361302141a09284fde9bbc14fdd16bf Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 7 May 2020 18:23:06 +0530 Subject: net: qrtr: Do not depend on ARCH_QCOM IPC Router protocol is also used by external modems for exchanging QMI messages. Hence, it doesn't always depend on Qualcomm platforms. One such instance is the QCA6390 WLAN device connected to an x86 machine. Reviewed-by: Bjorn Andersson Signed-off-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- net/qrtr/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index 8eb876471564..f362ca316015 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -4,7 +4,6 @@ config QRTR tristate "Qualcomm IPC Router support" - depends on ARCH_QCOM || COMPILE_TEST ---help--- Say Y if you intend to use Qualcomm IPC router protocol. The protocol is used to communicate with services provided by other -- cgit v1.2.3 From e1eea8112017cbdc596d90caf6ede191502a9691 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 5 May 2020 22:20:52 +0300 Subject: net: dsa: introduce a dsa_port_from_netdev public helper As its implementation shows, this is synonymous with calling dsa_slave_dev_check followed by dsa_slave_to_port, so it is quite simple already and provides functionality which is already there. However there is now a need for these functions outside dsa_priv.h, for example in drivers that perform mirroring and redirection through tc-flower offloads (they are given raw access to the flow_cls_offload structure), where they need to call this function on act->dev. But simply exporting dsa_slave_to_port would make it non-inline and would result in an extra function call in the hotpath, as can be seen for example in sja1105: Before: 000006dc : { 6dc: e92d4ff0 push {r4, r5, r6, r7, r8, r9, sl, fp, lr} 6e0: e1a04000 mov r4, r0 6e4: e591958c ldr r9, [r1, #1420] ; 0x58c <- Inline dsa_slave_to_port 6e8: e1a05001 mov r5, r1 6ec: e24dd004 sub sp, sp, #4 u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); 6f0: e1c901d8 ldrd r0, [r9, #24] 6f4: ebfffffe bl 0 6f4: R_ARM_CALL dsa_8021q_tx_vid u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); 6f8: e1d416b0 ldrh r1, [r4, #96] ; 0x60 u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); 6fc: e1a08000 mov r8, r0 After: 000006e4 : { 6e4: e92d4ff0 push {r4, r5, r6, r7, r8, r9, sl, fp, lr} 6e8: e1a04000 mov r4, r0 6ec: e24dd004 sub sp, sp, #4 struct dsa_port *dp = dsa_slave_to_port(netdev); 6f0: e1a00001 mov r0, r1 { 6f4: e1a05001 mov r5, r1 struct dsa_port *dp = dsa_slave_to_port(netdev); 6f8: ebfffffe bl 0 6f8: R_ARM_CALL dsa_slave_to_port 6fc: e1a09000 mov r9, r0 u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); 700: e1c001d8 ldrd r0, [r0, #24] 704: ebfffffe bl 0 704: R_ARM_CALL dsa_8021q_tx_vid Because we want to avoid possible performance regressions, introduce this new function which is designed to be public. Suggested-by: Vivien Didelot Signed-off-by: Vladimir Oltean Reviewed-by: Vivien Didelot Signed-off-by: David S.
Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fb3f9222f2a1..6dfc8c2f68b8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -637,6 +637,7 @@ void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { struct dsa_switch *ds; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0384a911779e..1ce9ba8cf545 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -412,6 +412,15 @@ void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, } EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) +{ + if (!netdev || !dsa_slave_dev_check(netdev)) + return ERR_PTR(-ENODEV); + + return dsa_slave_to_port(netdev); +} +EXPORT_SYMBOL_GPL(dsa_port_from_netdev); + static int __init dsa_init_module(void) { int rc; -- cgit v1.2.3 From f96e87178bb819a2ee25623a6935fa850c2defdd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 6 May 2020 15:47:45 +0000 Subject: hsr: remove WARN_ONCE() in hsr_fill_frame_info() When a VLAN frame is being sent, hsr calls WARN_ONCE() because hsr doesn't support VLAN. But using WARN_ONCE() is overkill. Using netdev_warn_once() is enough. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ddd9605bad04..ed13760463de 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -321,7 +321,7 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame, if (ethhdr->h_proto == htons(ETH_P_8021Q)) { frame->is_vlan = true; /* FIXME: */ - WARN_ONCE(1, "HSR: VLAN not yet supported"); + netdev_warn_once(skb->dev, "VLAN not yet supported"); } if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { -- cgit v1.2.3 From c75a33c84b83ffbb8b8b58a6bf4dea69dba21326 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 6 May 2020 17:58:27 -0700 Subject: net: remove newlines in NL_SET_ERR_MSG_MOD The NL_SET_ERR_MSG_MOD macro is used to report a string describing an error message to userspace via the netlink extended ACK structure. It should not have a trailing newline. Add a cocci script which catches cases where the newline marker is present. Using this script, fix the handful of cases which accidentally included a trailing newline. I couldn't figure out a way to get a patch mode working, so this script only implements context, report, and org. Signed-off-by: Jacob Keller Cc: Jakub Kicinski Cc: Andy Whitcroft Cc: Joe Perches Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- drivers/net/ethernet/mscc/ocelot_tc.c | 6 +- net/bridge/br_mrp_netlink.c | 2 +- net/bridge/br_stp_if.c | 2 +- net/dsa/slave.c | 6 +- scripts/coccinelle/misc/newline_in_nl_msg.cocci | 75 +++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 scripts/coccinelle/misc/newline_in_nl_msg.cocci (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 77397aa66810..a050808f2128 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1097,7 +1097,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, if (IS_ERR(priv->fs.tc.t)) { mutex_unlock(&priv->fs.tc.t_lock); NL_SET_ERR_MSG_MOD(extack, - "Failed to create tc offload table\n"); + "Failed to create tc offload table"); netdev_err(priv->netdev, "Failed to create tc offload table\n"); return PTR_ERR(priv->fs.tc.t); diff --git a/drivers/net/ethernet/mscc/ocelot_tc.c b/drivers/net/ethernet/mscc/ocelot_tc.c index d326e231f0ad..b7baf7624e18 100644 --- a/drivers/net/ethernet/mscc/ocelot_tc.c +++ b/drivers/net/ethernet/mscc/ocelot_tc.c @@ -48,7 +48,7 @@ static int ocelot_setup_tc_cls_matchall(struct ocelot_port_private *priv, if (priv->tc.police_id && priv->tc.police_id != f->cookie) { NL_SET_ERR_MSG_MOD(extack, - "Only one policer per port is supported\n"); + "Only one policer per port is supported"); return -EEXIST; } @@ -59,7 +59,7 @@ static int ocelot_setup_tc_cls_matchall(struct ocelot_port_private *priv, err = ocelot_port_policer_add(ocelot, port, &pol); if (err) { - NL_SET_ERR_MSG_MOD(extack, "Could not add policer\n"); + NL_SET_ERR_MSG_MOD(extack, "Could not add policer"); return err; } @@ -73,7 +73,7 @@ static int ocelot_setup_tc_cls_matchall(struct ocelot_port_private *priv, err = ocelot_port_policer_del(ocelot, port); if (err) { NL_SET_ERR_MSG_MOD(extack, - "Could not delete policer\n"); + "Could not delete policer"); return err; } priv->tc.police_id = 0; diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 503896638be0..397e7f710772 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -28,7 +28,7 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, int err; if (br->stp_enabled != BR_NO_STP) { - NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled\n"); + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled"); return -EINVAL; } diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a42850b7eb9a..ba55851fe132 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -203,7 +203,7 @@ int br_stp_set_enabled(struct net_bridge *br, unsigned long val, if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, - "STP can't be enabled if MRP is already enabled\n"); + "STP can't be enabled if MRP is already enabled"); return -EINVAL; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ea0fcf7bf786..dfb4282fc339 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -911,13 +911,13 @@ dsa_slave_add_cls_matchall_police(struct net_device *dev, if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, - "Policing offload not implemented\n"); + "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, - "Only supported on ingress qdisc\n"); + "Only supported on ingress qdisc"); return -EOPNOTSUPP; } @@ -928,7 +928,7 @@ 
dsa_slave_add_cls_matchall_police(struct net_device *dev, list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, - "Only one port policer allowed\n"); + "Only one port policer allowed"); return -EEXIST; } } diff --git a/scripts/coccinelle/misc/newline_in_nl_msg.cocci b/scripts/coccinelle/misc/newline_in_nl_msg.cocci new file mode 100644 index 000000000000..c175886e4015 --- /dev/null +++ b/scripts/coccinelle/misc/newline_in_nl_msg.cocci @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Catch strings ending in newline with GENL_SET_ERR_MSG, NL_SET_ERR_MSG, +/// NL_SET_ERR_MSG_MOD. +/// +// Confidence: Very High +// Copyright: (C) 2020 Intel Corporation +// URL: http://coccinelle.lip6.fr/ +// Options: --no-includes --include-headers + +virtual context +virtual org +virtual report + +@r depends on context || org || report@ +expression e; +constant m; +position p; +@@ + \(GENL_SET_ERR_MSG\|NL_SET_ERR_MSG\|NL_SET_ERR_MSG_MOD\)(e,m@p) + +@script:python@ +m << r.m; +@@ + +if not m.endswith("\\n\""): + cocci.include_match(False) + +@r1 depends on r@ +identifier fname; +expression r.e; +constant r.m; +position r.p; +@@ + fname(e,m@p) + +//---------------------------------------------------------- +// For context mode +//---------------------------------------------------------- + +@depends on context && r@ +identifier r1.fname; +expression r.e; +constant r.m; +@@ +* fname(e,m) + +//---------------------------------------------------------- +// For org mode +//---------------------------------------------------------- + +@script:python depends on org@ +fname << r1.fname; +m << r.m; +p << r.p; +@@ + +if m.endswith("\\n\""): + msg="WARNING avoid newline at end of message in %s" % (fname) + msg_safe=msg.replace("[","@(").replace("]",")") + coccilib.org.print_todo(p[0], msg_safe) + +//---------------------------------------------------------- +// For report mode +//---------------------------------------------------------- + +@script:python depends on report@ +fname << r1.fname; +m << r.m; +p << r.p; +@@ + +if m.endswith("\\n\""): + msg="WARNING avoid newline at end of message in %s" % (fname) + coccilib.report.print_report(p[0], msg) -- cgit v1.2.3 From 636ef28d6e4d174e424102466caf572b0406fb0e Mon Sep 17 00:00:00 2001 From: zhang kai Date: Thu, 7 May 2020 11:08:30 +0800 Subject: tcp: tcp_mark_head_lost is only valid for sack-tcp so tcp_is_sack/reno checks are removed from tcp_mark_head_lost. Signed-off-by: zhang kai Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 66e55e51c550..7529c2816f2f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2183,8 +2183,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For non-SACK(Reno) senders, the first "packets" number of segments - * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. 
*/ @@ -2192,10 +2191,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt, lost; - unsigned int mss; + int cnt; /* Use SACK to deduce losses of new sequences sent during recovery */ - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; + const u32 loss_high = tp->snd_nxt; WARN_ON(packets > tp->packets_out); skb = tp->lost_skb_hint; @@ -2218,26 +2216,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; - oldcnt = cnt; - if (tcp_is_reno(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) cnt += tcp_skb_pcount(skb); - if (cnt > packets) { - if (tcp_is_sack(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || - (oldcnt >= packets)) - break; - - mss = tcp_skb_mss(skb); - /* If needed, chop off the prefix to mark as lost. */ - lost = (packets - oldcnt) * mss; - if (lost < skb->len && - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, - lost, mss, GFP_ATOMIC) < 0) - break; - cnt = packets; - } + if (cnt > packets) + break; tcp_skb_mark_lost(tp, skb); @@ -2849,8 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_is_reno(tp) || - tcp_force_fast_retransmit(sk); + do_lost = tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); -- cgit v1.2.3 From ca7e3edc221d5cf750ae04cac29cf9fe9db38e84 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 7 May 2020 16:24:06 +0200 Subject: net/smc: remove set but not used variables 'del_llc, del_llc_resp' Fixes gcc '-Wunused-but-set-variable' warning: net/smc/smc_llc.c: In function 'smc_llc_cli_conf_link': net/smc/smc_llc.c:753:31: warning: variable 'del_llc' set but not used [-Wunused-but-set-variable] struct smc_llc_msg_del_link *del_llc; ^ net/smc/smc_llc.c: In function 'smc_llc_process_srv_delete_link': net/smc/smc_llc.c:1311:33: warning: variable 'del_llc_resp' set but not used [-Wunused-but-set-variable] struct smc_llc_msg_del_link *del_llc_resp; ^ Signed-off-by: YueHaibing Signed-off-by: Karsten Graul Signed-off-by: David S. 
Miller --- net/smc/smc_llc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4cc583678ac7..391237b601fe 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -750,7 +750,6 @@ static int smc_llc_cli_conf_link(struct smc_link *link, enum smc_lgr_type lgr_new_t) { struct smc_link_group *lgr = link->lgr; - struct smc_llc_msg_del_link *del_llc; struct smc_llc_qentry *qentry = NULL; int rc = 0; @@ -764,7 +763,6 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { /* received DELETE_LINK instead */ - del_llc = &qentry->msg.delete_link; qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, &qentry->msg); smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); @@ -1308,16 +1306,12 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) * enqueued DELETE_LINK request (forward it) */ if (!smc_llc_send_message(lnk, &qentry->msg)) { - struct smc_llc_msg_del_link *del_llc_resp; struct smc_llc_qentry *qentry2; qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK); - if (!qentry2) { - } else { - del_llc_resp = &qentry2->msg.delete_link; + if (qentry2) smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - } } } smcr_link_clear(lnk_del, true); -- cgit v1.2.3 From 307f660d056b5eb8f5bb2328fac3915ab75b5007 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:18 -0700 Subject: netpoll: remove dev argument from netpoll_send_skb_on_dev() netpoll_send_skb_on_dev() can get the device pointer directly from np->dev Rename it to __netpoll_send_skb() Following patch will move netpoll_send_skb() out-of-line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 5 ++--- net/core/netpoll.c | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 676f1ff161a9..00e0bae3d402 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,13 +63,12 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev); +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; local_irq_save(flags); - netpoll_send_skb_on_dev(np, skb, np->dev); + __netpoll_send_skb(np, skb); local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 15b366a1a958..c5059b7ffc94 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,17 +305,19 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
*/ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); - npinfo = rcu_dereference_bh(np->dev->npinfo); + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return; @@ -358,7 +360,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); +EXPORT_SYMBOL(__netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From fb1eee476b0d3be3e58dac1a3a96f726c6278bed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:19 -0700 Subject: netpoll: move netpoll_send_skb() out of line There is no need to inline this helper, as we intend to add more code in this function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 9 +-------- net/core/netpoll.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 00e0bae3d402..e466ddffef61 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,14 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); -static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) -{ - unsigned long flags; - local_irq_save(flags); - __netpoll_send_skb(np, skb); - local_irq_restore(flags); -} +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c5059b7ffc94..34cd34f24423 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -360,7 +360,16 @@ void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(__netpoll_send_skb); + +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + + local_irq_save(flags); + __netpoll_send_skb(np, skb); + local_irq_restore(flags); +} +EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From 1ddabdfaf70c202b88925edd74c66f4707dbd92e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:20 -0700 Subject: netpoll: netpoll_send_skb() returns transmit status Some callers want to know if the packet has been sent or dropped, to inform upper stacks. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 2 +- net/core/netpoll.c | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e466ddffef61..f47af135bd56 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,7 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 34cd34f24423..40d2753aa47d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -320,7 +320,7 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return; + return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ @@ -359,15 +359,18 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + return NETDEV_TX_OK; } -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; + netdev_tx_t ret; local_irq_save(flags); - __netpoll_send_skb(np, skb); + ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); + return ret; } EXPORT_SYMBOL(netpoll_send_skb); -- cgit v1.2.3 From f78ed2204db9fc35b545d693865bddbe0149aa1f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:21 -0700 Subject: netpoll: accept NULL np argument in netpoll_send_skb() netpoll_send_skb() callers seem to leak skb if the np pointer is NULL. While this should not happen, we can make the code more robust. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 5 ++--- include/linux/if_team.h | 5 +---- include/net/bonding.h | 5 +---- net/8021q/vlan_dev.c | 5 ++--- net/bridge/br_private.h | 5 +---- net/core/netpoll.c | 11 ++++++++--- net/dsa/slave.c | 5 ++--- 7 files changed, 17 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 34eb073cdd74..9a419d5102ce 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -542,12 +542,11 @@ xmit_world: static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, diff --git a/include/linux/if_team.h b/include/linux/if_team.h index ec7e4bd07f82..537dc2b8c879 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -102,10 +102,7 @@ static inline bool team_port_dev_txable(const struct net_device *port_dev) static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { - struct netpoll *np = port->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, diff --git a/include/net/bonding.h b/include/net/bonding.h index 0b696da5c115..f211983cd52a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -507,10 +507,7 @@ static inline unsigned long slave_last_rx(struct bonding *bond, static inline void bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { - struct netpoll *np = slave->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(slave->np, skb); } #else static inline void bond_netpoll_send_skb(const struct slave *slave, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 319220b2341d..f00bb57f0f60 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -88,12 +88,11 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 78d3a951180d..4dc21e8f7e33 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -598,10 +598,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { - struct netpoll *np = p->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 40d2753aa47d..093e90e52bc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -367,9 +367,14 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) unsigned long flags; netdev_tx_t ret; - local_irq_save(flags); - ret = __netpoll_send_skb(np, skb); - local_irq_restore(flags); + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } return 
ret; } EXPORT_SYMBOL(netpoll_send_skb); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index dfb4282fc339..61b0de52040a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -445,12 +445,11 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_slave_priv *p = netdev_priv(dev); - if (p->netpoll) - netpoll_send_skb(p->netpoll, skb); + return netpoll_send_skb(p->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, -- cgit v1.2.3 From 790709f249728640faa4eff38286a9feb34fed81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 10:05:39 -0700 Subject: net: relax SO_TXTIME CAP_NET_ADMIN check Now sch_fq has horizon feature, we want to allow QUIC/UDP applications to use EDT model so that pacing can be offloaded to the kernel (sch_fq) or the NIC. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Willem de Bruijn Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/core/sock.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index b714162213ae..fd85e651ce28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1152,23 +1152,31 @@ set_rcvbuf: break; case SO_TXTIME: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - } else if (optlen != sizeof(struct sock_txtime)) { + if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; + break; } else if (copy_from_user(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; + break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; - } else { - sock_valbool_flag(sk, SOCK_TXTIME, true); - sk->sk_clockid = sk_txtime.clockid; - sk->sk_txtime_deadline_mode = - !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); - sk->sk_txtime_report_errors = - !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: -- cgit v1.2.3 From cb0721c7e200750907bb8ef59b12646a5cb2dadf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:10 -0700 Subject: net: Refactor arguments of inet{,6}_bind The intent is to add an additional bind parameter in the next commit. Instead of adding another argument, let's convert all existing flag arguments into an extendable bit field. No functional changes. 
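
[Illustration, not part of the patch: a small self-contained sketch of the pattern this refactor adopts. The flag names mirror the patch; the function body and main() are hypothetical. A single extendable flags word replaces a growing list of bool parameters, so new behaviors can be added without touching every caller.]

#include <stdint.h>
#include <stdio.h>

/* Flag names mirror the patch; the function itself is illustrative. */
#define BIND_FORCE_ADDRESS_NO_PORT	(1u << 0)
#define BIND_WITH_LOCK			(1u << 1)

static int example_bind(uint32_t flags)
{
	if (flags & BIND_WITH_LOCK)
		printf("lock_sock()\n");
	if (!(flags & BIND_FORCE_ADDRESS_NO_PORT))
		printf("allocate local port\n");
	if (flags & BIND_WITH_LOCK)
		printf("release_sock()\n");
	return 0;
}

int main(void)
{
	/* Old API: example_bind(false, true). New API composes flags: */
	return example_bind(BIND_WITH_LOCK);
}
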
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-4-sdf@google.com --- include/net/inet_common.h | 6 +++++- include/net/ipv6_stubs.h | 2 +- net/core/filter.c | 6 ++++-- net/ipv4/af_inet.c | 10 +++++----- net/ipv6/af_inet6.c | 10 +++++----- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index ae2ba897675c..c38f4f7d660a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,8 +35,12 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +/* Don't allocate port at this moment, defer to connect. */ +#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) +/* Grab and release socket lock. */ +#define BIND_WITH_LOCK (1 << 1) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index a5f7c12c326a..6e622dd3122e 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); struct sock *(*udp6_lib_lookup)(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, diff --git a/net/core/filter.c b/net/core/filter.c index dfaf5df13722..fa9ddab5dd1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4538,7 +4538,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port != htons(0)) return err; - return __inet_bind(sk, addr, addr_len, true, false); + return __inet_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -4548,7 +4549,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6177c4ba0037..68e74b1b0f26 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,12 +450,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet_bind(sk, uaddr, addr_len, false, true); + return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -506,7 +506,7 @@ int __inet_bind(struct 
sock *sk, struct sockaddr *uaddr, int addr_len, * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -520,7 +520,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; @@ -543,7 +543,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk_dst_reset(sk); err = 0; out_release_sock: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..552c2592b81c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,7 +273,7 @@ out_rcu_unlock: } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -297,7 +297,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -400,7 +400,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); @@ -423,7 +423,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_dport = 0; inet->inet_daddr = 0; out: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, false, true); + return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet6_bind); -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have a tighter control on what ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port() that verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling POST_BIND hook anymore, let's add another bind flag to indicate that the call site is BPF program. 
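
[Illustration, not part of the patch: a hedged userspace sanity check of the behavior; the selftests added below exercise this more thoroughly. Assuming the connect4 program from this patch is attached to the caller's cgroup, the connected socket's source port should come back as the one bpf_bind() forced. Error handling is trimmed and the destination address is hypothetical.]

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(80) };
	struct sockaddr_in src;
	socklen_t len = sizeof(src);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&src, &len) == 0)
		printf("source port: %u\n", ntohs(src.sin_port)); /* expect 22222 */
	close(fd);
	return 0;
}
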
v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 + include/uapi/linux/bpf.h | 9 +- net/core/filter.c | 18 ++-- net/ipv4/af_inet.c | 10 +- net/ipv6/af_inet6.c | 12 ++- tools/include/uapi/linux/bpf.h | 9 +- .../selftests/bpf/prog_tests/connect_force_port.c | 115 +++++++++++++++++++++ .../selftests/bpf/progs/connect_force_port4.c | 28 +++++ .../selftests/bpf/progs/connect_force_port6.c | 28 +++++ 9 files changed, 203 insertions(+), 28 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_force_port.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port4.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port6.c (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. */ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/net/core/filter.c b/net/core/filter.c index fa9ddab5dd1f..da0634979f53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4525,32 +4525,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. 
- */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68e74b1b0f26..fcf0d12a407a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -526,10 +526,12 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 552c2592b81c..771a462a8322 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -407,11 +407,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. 
* diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..47fbb20cb6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_port(int family, int fd, int expected) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected) { + log_err("Unexpected port %d, expected %d", ntohs(port), + expected); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + struct bpf_prog_load_attr attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + }; + struct bpf_object *obj; + int expected_port; + int prog_fd; + int err; + int fd; + + if (family == AF_INET) { + attr.file = "./connect_force_port4.o"; + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + expected_port = 22222; + } else { + attr.file = "./connect_force_port6.o"; + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; + expected_port = 22223; + } + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, + 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_port(family, fd, expected_port); + + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..1b8eb34b2db0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int _connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; 
+ + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..ae6f7d750b4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int _connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} -- cgit v1.2.3 From cf86a086a18095e33e0637cb78cda1fcf5280852 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 18:58:10 -0700 Subject: net/dst: use a smaller percpu_counter batch for dst entries accounting percpu_counter_add() uses a default batch size which is quite big on platforms with 256 cpus. (2*256 -> 512) This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2) (131072 on servers with 256 cpus) Reduce the batch size to something more reasonable, and add logic to ip6_dst_gc() to call dst_entries_get_slow() before calling the _very_ expensive fib6_run_gc() function. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 4 +++- net/core/dst.c | 8 ++++---- net/ipv6/route.c | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 443863c7b8da..88ff7bb2bb9b 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(struct dst_ops *dst) return percpu_counter_sum_positive(&dst->pcpuc_entries); } +#define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { - percpu_counter_add(&dst->pcpuc_entries, val); + percpu_counter_add_batch(&dst->pcpuc_entries, val, + DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) diff --git a/net/core/dst.c b/net/core/dst.c index 193af526e908..d6b6ced0d451 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1ff142393c76..a9072dba00f4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3195,6 +3195,9 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); + if (entries > rt_max_size) + entries = dst_entries_get_slow(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && entries <= rt_max_size) goto out; -- cgit v1.2.3 From 3712c1c2ef2a0cbca633d150e568deb8f63a6f29 Mon Sep 17 00:00:00 2001 
From: Yang Yingliang Date: Fri, 8 May 2020 11:52:08 +0800 Subject: ieee802154: 6lowpan: remove unnecessary comparison The type of dispatch is u8 which is always '<=' 0xff, so the dispatch <= 0xff is always true, we can remove this comparison. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Stefan Schmidt Signed-off-by: Jakub Kicinski --- net/ieee802154/6lowpan/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index ee179380a766..b34d050c9687 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -240,7 +240,7 @@ static inline bool lowpan_is_reserved(u8 dispatch) return ((dispatch >= 0x44 && dispatch <= 0x4F) || (dispatch >= 0x51 && dispatch <= 0x5F) || (dispatch >= 0xc8 && dispatch <= 0xdf) || - (dispatch >= 0xe8 && dispatch <= 0xff)); + dispatch >= 0xe8); } /* lowpan_rx_h_check checks on generic 6LoWPAN requirements -- cgit v1.2.3 From d8882935fcae28bceb5f6f56f09cded8d36d85e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 May 2020 07:34:14 -0700 Subject: ipv6: use DST_NOCOUNT in ip6_rt_pcpu_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently have to adjust ipv6 route gc_thresh/max_size depending on number of cpus on a server, this makes very little sense. If the kernels sets /proc/sys/net/ipv6/route/gc_thresh to 1024 and /proc/sys/net/ipv6/route/max_size to 4096, then we better not track the percpu dst that our implementation uses. Only routes not added (directly or indirectly) by the admin should be tracked and limited. Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Cc: David Ahern Cc: Maciej Żenczykowski Acked-by: Wei Wang Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a9072dba00f4..4292653af533 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1377,7 +1377,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); - pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); -- cgit v1.2.3 From 138d0be35b141e09f6b267c6ae4094318d4e4491 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:10 -0700 Subject: net: bpf: Add netlink and ipv6_route bpf_iter targets This patch added netlink and ipv6_route targets, using the same seq_ops (except show() and minor changes for stop()) for /proc/net/{netlink,ipv6_route}. The net namespace for these targets are the current net namespace at file open stage, similar to /proc/net/{netlink,ipv6_route} reference counting the net namespace at seq_file open stage. Since module is not supported for now, ipv6_route is supported only if the IPV6 is built-in, i.e., not compiled as a module. The restriction can be lifted once module is properly supported for bpf_iter. 
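
[Illustration, not part of the patch: a minimal sketch of a program for the new ipv6_route target, in the style of the selftests that accompany this series. vmlinux.h (providing the kernel types and the bpf_iter__ipv6_route context) and the bpf_seq_printf() helper are assumed available; the field printed is an arbitrary choice.]

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"		/* assumed: kernel BTF types */
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("iter/ipv6_route")
int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct fib6_info *rt = ctx->rt;
	static const char fmt[] = "metric %u\n";
	unsigned long long metric;

	if (!rt)	/* stop() invokes the program with v == NULL */
		return 0;

	metric = rt->fib6_metric;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &metric, sizeof(metric));
	return 0;
}
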
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175910.2476329-1-yhs@fb.com --- fs/proc/proc_net.c | 19 +++++++++++ include/linux/proc_fs.h | 3 ++ net/ipv6/ip6_fib.c | 65 ++++++++++++++++++++++++++++++++++-- net/ipv6/route.c | 37 ++++++++++++++++++++ net/netlink/af_netlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 207 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4888c5224442..dba63b2429f0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; +int bpf_iter_init_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net(current->nsproxy->net_ns); +#endif + return 0; +} + +void bpf_iter_fini_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + put_net(p->net); +#endif +} + struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..03953c59807d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); +extern int bpf_iter_init_seq_net(void *priv_data); +extern void bpf_iter_fini_seq_net(void *priv_data); + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 46ed56719476..a1fcc0ca21af 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void) } #ifdef CONFIG_PROC_FS -static int ipv6_route_seq_show(struct seq_file *seq, void *v) +static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; @@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) return w->node && !(w->state == FWS_U && w->node == w->root); } -static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { struct net *net = seq_file_net(seq); @@ -2637,6 +2637,67 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock_bh(); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; + +static int ipv6_route_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__ipv6_route ctx; + + ctx.meta = meta; + ctx.rt = v; + return bpf_iter_run_prog(prog, &ctx); +} + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct ipv6_route_iter *iter = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return ipv6_route_native_seq_show(seq, v); + + ret = ipv6_route_prog_seq_show(prog, &meta, v); + iter->w.leaf = NULL; + + return ret; +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta 
meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)ipv6_route_prog_seq_show(prog, &meta, v); + } + + ipv6_route_native_seq_stop(seq, v); +} +#else +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + return ipv6_route_native_seq_show(seq, v); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + ipv6_route_native_seq_stop(seq, v); +} +#endif + const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3912aac7854d..25f6d3e619d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6393,6 +6393,30 @@ void __init ip6_route_init_special_entries(void) #endif } +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) + +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), + }; + + return bpf_iter_reg_target(®_info); +} + +static void bpf_iter_unregister(void) +{ + bpf_iter_unreg_target("ipv6_route"); +} +#endif +#endif + int __init ip6_route_init(void) { int ret; @@ -6455,6 +6479,14 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + ret = bpf_iter_register(); + if (ret) + goto out_register_late_subsys; +#endif +#endif + for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); @@ -6487,6 +6519,11 @@ out_kmem_cache: void ip6_route_cleanup(void) { +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_unregister(); +#endif +#endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ded01ca8b20..33cda9baa979 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) return __netlink_seq_next(seq); } -static void netlink_seq_stop(struct seq_file *seq, void *v) +static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; @@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v) } -static int netlink_seq_show(struct seq_file *seq, void *v) +static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__netlink { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct netlink_sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) + +static int netlink_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__netlink ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk = nlk_sk((struct sock *)v); + return bpf_iter_run_prog(prog, &ctx); +} + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta 
meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return netlink_native_seq_show(seq, v); + + if (v != SEQ_START_TOKEN) + return netlink_prog_seq_show(prog, &meta, v); + + return 0; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)netlink_prog_seq_show(prog, &meta, v); + } + + netlink_native_seq_stop(seq, v); +} +#else +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + return netlink_native_seq_show(seq, v); +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + netlink_native_seq_stop(seq, v); +} +#endif + static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, @@ -2740,6 +2802,21 @@ static const struct rhashtable_params netlink_rhashtable_params = { .automatic_shrinking = true, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static int __init bpf_iter_register(void) +{ + struct bpf_iter_reg reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), + }; + + return bpf_iter_reg_target(®_info); +} +#endif + static int __init netlink_proto_init(void) { int i; @@ -2748,6 +2825,12 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + err = bpf_iter_register(); + if (err) + goto out; +#endif + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); -- cgit v1.2.3 From 11ca3c4261cdb4e2f33e32daf6447f8185843317 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:33 +0200 Subject: net: ethtool: netlink: Add support for triggering a cable test Add new ethtool netlink calls to trigger the starting of a PHY cable test. Add Kconfig'ury to ETHTOOL_NETLINK so that PHYLIB is not a module when ETHTOOL_NETLINK is builtin, which would result in kernel linking errors. 
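
[Illustration, not part of the patch: driver-side, the trigger lands in phy_start_cable_test(), which relies on two PHY driver hooks introduced alongside this series. A hedged sketch of that contract follows, with a made-up register layout — EX_CABLE_DIAG_REG and its bit assignment are hypothetical, not a real PHY.]

/* Hypothetical vendor register; bit 15 starts the test and self-clears. */
#define EX_CABLE_DIAG_REG	0x1e

static int example_cable_test_start(struct phy_device *phydev)
{
	return phy_set_bits(phydev, EX_CABLE_DIAG_REG, BIT(15));
}

static int example_cable_test_get_status(struct phy_device *phydev,
					 bool *finished)
{
	int val = phy_read(phydev, EX_CABLE_DIAG_REG);

	if (val < 0)
		return val;

	*finished = !(val & BIT(15));
	/* On completion, per-pair results are reported through the
	 * helpers added later in this series (ethnl_cable_test_result()
	 * and friends).
	 */
	return 0;
}

static struct phy_driver example_driver = {
	/* ... */
	.cable_test_start	= example_cable_test_start,
	.cable_test_get_status	= example_cable_test_get_status,
};
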
v2: Remove unwanted white space change Remove ethnl_cable_test_act_ops and use doit handler Rename cable_test_set_policy cable_test_act_policy Remove ETHTOOL_MSG_CABLE_TEST_ACT_REPLY v3: Remove ETHTOOL_MSG_CABLE_TEST_ACT_REPLY from documentation Remove unused cable_test_get_policy Add Reviewed-by tags v4: Remove unwanted blank line Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 16 ++++++++- include/uapi/linux/ethtool_netlink.h | 12 +++++++ net/Kconfig | 1 + net/ethtool/Makefile | 2 +- net/ethtool/cabletest.c | 54 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 +++ net/ethtool/netlink.h | 1 + 7 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/cabletest.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 8f5cefc539cf..a8731d33d0c9 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -204,6 +204,7 @@ Userspace to kernel: ``ETHTOOL_MSG_EEE_GET`` get EEE settings ``ETHTOOL_MSG_EEE_SET`` set EEE settings ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info + ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ===================================== ================================ Kernel to userspace: @@ -958,13 +959,25 @@ Kernel response contents: is no special value for this case). The bitset attributes are omitted if they would be empty (no bit set). +CABLE_TEST +========== + +Start a cable test. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_CABLE_TEST_HEADER`` nested request header + ==================================== ====== ========================== + Request translation =================== The following table maps ioctl commands to netlink commands providing their functionality. Entries with "n/a" in right column are commands which do not -have their netlink replacement yet. +have their netlink replacement yet. Entries which "n/a" in the left column +are netlink only. =================================== ===================================== ioctl command netlink command @@ -1053,4 +1066,5 @@ have their netlink replacement yet. 
``ETHTOOL_PHY_STUNABLE`` n/a ``ETHTOOL_GFECPARAM`` n/a ``ETHTOOL_SFECPARAM`` n/a + n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bf1d310e20bc..6bfd648c32cf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -39,6 +39,7 @@ enum { ETHTOOL_MSG_EEE_GET, ETHTOOL_MSG_EEE_SET, ETHTOOL_MSG_TSINFO_GET, + ETHTOOL_MSG_CABLE_TEST_ACT, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -405,6 +406,17 @@ enum { ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; +/* CABLE TEST */ + +enum { + ETHTOOL_A_CABLE_TEST_UNSPEC, + ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_CNT, + ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/Kconfig b/net/Kconfig index c5ba2d180c43..5c524c6ee75d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -455,6 +455,7 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" default y + depends on PHYLIB=y || PHYLIB=n help An alternative userspace interface for ethtool based on generic netlink. It provides better extensibility and some new features, diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 6c360c9c9370..0c2b94f20499 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c new file mode 100644 index 000000000000..aeb6672a46d0 --- /dev/null +++ b/net/ethtool/cabletest.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include "netlink.h" +#include "common.h" + +/* CABLE_TEST_ACT */ + +static const struct nla_policy +cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_MAX, + cable_test_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test(dev->phydev, info->extack); + + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0c772318c023..b9c9ddf408fe 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -839,6 +839,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, + .flags = 
GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 81b8fa020bcb..bd7df592db2f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -357,5 +357,6 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 0df960f14e17e55e68dfd1342f063d17dbcc6107 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:35 +0200 Subject: net: ethtool: Make helpers public Make some helpers for building ethtool netlink messages available outside the compilation unit, so they can be used for building messages which are not simple get/set. Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- net/ethtool/netlink.c | 4 ++-- net/ethtool/netlink.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index b9c9ddf408fe..87bc02da74bc 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,13 +181,13 @@ err: return NULL; } -static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) +void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, cmd); } -static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) +int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index bd7df592db2f..b0eb5d920099 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,8 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); +int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string -- cgit v1.2.3 From 1dd3f212af30b42c90ba252c165f2f6d2ddf5230 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:36 +0200 Subject: net: ethtool: Add infrastructure for reporting cable test results Provide infrastructure for PHY drivers to report the cable test results. A netlink skb is associated to the phydev. Helpers will be added which can add results to this skb. Once the test has finished the results are sent to user space. When netlink ethtool is not part of the kernel configuration stubs are provided. It is also impossible to trigger a cable test, so the error code returned by the alloc function is of no consequence. 
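
[Illustration, not part of the patch: the call order this infrastructure sets up, condensed from this patch and the phy.c changes below. Simplified — locking and state handling elided. The core pre-allocates the result skb before starting, frees it if the start fails, and multicasts it once the PHY reports completion.]

static int sketch_start(struct phy_device *phydev)
{
	int err = ethnl_cable_test_alloc(phydev);	/* pre-allocate result skb */

	if (err)
		return err;

	err = phydev->drv->cable_test_start(phydev);
	if (err) {
		ethnl_cable_test_free(phydev);		/* nothing will be reported */
		return err;
	}
	return 0;	/* PHY state machine now polls for completion */
}

static void sketch_poll(struct phy_device *phydev)
{
	bool finished = false;

	if (!phydev->drv->cable_test_get_status(phydev, &finished) && finished)
		ethnl_cable_test_finished(phydev);	/* close nest, multicast skb */
}
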
v2: Include the status complete in the netlink notification message v4: Replace -EINVAL with -EMSGSIZE Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 22 +++++++++++++++-- include/linux/ethtool_netlink.h | 20 +++++++++++++++ include/linux/phy.h | 5 ++++ net/ethtool/cabletest.c | 55 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 9fa61019533f..afdc1c2146ee 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include #include +#include +#include +#include #define PHY_STATE_TIME HZ @@ -478,6 +482,8 @@ static void phy_abort_cable_test(struct phy_device *phydev) { int err; + ethnl_cable_test_finished(phydev); + err = phy_init_hw(phydev); if (err) phydev_err(phydev, "Error while aborting cable test"); @@ -486,7 +492,7 @@ static void phy_abort_cable_test(struct phy_device *phydev) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { - int err; + int err = -ENOMEM; if (!(phydev->drv && phydev->drv->cable_test_start && @@ -512,19 +518,30 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } + err = ethnl_cable_test_alloc(phydev); + if (err) + goto out; + /* Mark the carrier down until the test is complete */ phy_link_down(phydev, true); err = phydev->drv->cable_test_start(phydev); if (err) { phy_link_up(phydev); - goto out; + goto out_free; } phydev->state = PHY_CABLETEST; if (phy_polling_mode(phydev)) phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); out: mutex_unlock(&phydev->lock); @@ -964,6 +981,7 @@ void phy_state_machine(struct work_struct *work) } if (finished) { + ethnl_cable_test_finished(phydev); needs_aneg = true; phydev->state = PHY_UP; } diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index d01b77887f82..7d763ba22f6f 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -14,4 +14,24 @@ enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; +struct phy_device; + +#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) +int ethnl_cable_test_alloc(struct phy_device *phydev); +void ethnl_cable_test_free(struct phy_device *phydev); +void ethnl_cable_test_finished(struct phy_device *phydev); +#else +static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +{ + return -ENOTSUPP; +} + +static inline void ethnl_cable_test_free(struct phy_device *phydev) +{ +} + +static inline void ethnl_cable_test_finished(struct phy_device *phydev) +{ +} +#endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index f58eee735a45..169fae4249a9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -523,6 +523,11 @@ struct phy_device { /* For use by PHYs inside the same package that need a shared state. 
*/ struct phy_package_shared *shared; + /* Reporting cable test results */ + struct sk_buff *skb; + void *ehdr; + struct nlattr *nest; + /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index aeb6672a46d0..ae8e63647663 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include "netlink.h" #include "common.h" @@ -52,3 +53,57 @@ out_dev_put: dev_put(dev); return ret; } + +int ethnl_cable_test_alloc(struct phy_device *phydev) +{ + int err = -ENOMEM; + + phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!phydev->skb) + goto out; + + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, + ETHTOOL_MSG_CABLE_TEST_NTF); + if (!phydev->ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED); + if (err) + goto out; + + phydev->nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TEST_NTF_NEST); + if (!phydev->nest) + goto out; + + return 0; + +out: + nlmsg_free(phydev->skb); + return err; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); + +void ethnl_cable_test_free(struct phy_device *phydev) +{ + nlmsg_free(phydev->skb); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_free); + +void ethnl_cable_test_finished(struct phy_device *phydev) +{ + nla_nest_end(phydev->skb, phydev->nest); + + genlmsg_end(phydev->skb, phydev->ehdr); + + ethnl_multicast(phydev->skb, phydev->attached_dev); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); -- cgit v1.2.3 From 1e2dc14509fd072739e4bab98ac42317267dbad6 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:37 +0200 Subject: net: ethtool: Add helpers for reporting test results The PHY drivers can use these helpers for reporting the results. The results get translated into netlink attributes which are added to the pre-allocated skbuf. 
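
[Illustration, not part of the patch: a hedged example of how a PHY driver's completion path might use the new helpers. The pair and result-code constants come from the companion UAPI patch in this series and are named here from memory; the reported values are invented.]

static void example_report_results(struct phy_device *phydev)
{
	/* Pair A tested OK; pair B is open 15.00 m (1500 cm) out. */
	ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A,
				ETHTOOL_A_CABLE_RESULT_CODE_OK);
	ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_B,
				ETHTOOL_A_CABLE_RESULT_CODE_OPEN);
	ethnl_cable_test_fault_length(phydev, ETHTOOL_A_CABLE_PAIR_B, 1500);
}
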
v3: Poison phydev->skb Return -EMSGSIZE when ethnl_bcastmsg_put() fails Return valid error code when nla_nest_start() fails Use u8 for results Actually put u32 length into message v4: s/ENOTSUPP/EOPNOTSUPP/g Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- include/linux/ethtool_netlink.h | 15 +++++++++++- include/linux/phy.h | 4 ++++ net/ethtool/cabletest.c | 53 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 7d763ba22f6f..e317fc99565e 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -20,10 +20,12 @@ struct phy_device; int ethnl_cable_test_alloc(struct phy_device *phydev); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void ethnl_cable_test_free(struct phy_device *phydev) @@ -33,5 +35,16 @@ static inline void ethnl_cable_test_free(struct phy_device *phydev) static inline void ethnl_cable_test_finished(struct phy_device *phydev) { } +static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, + u8 result) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, + u8 pair, u32 cm) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 169fae4249a9..5d8ff5428010 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1265,6 +1265,10 @@ int phy_start_cable_test(struct phy_device *phydev, } #endif +int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); +int phy_cable_test_fault_length(struct phy_device *phydev, u8 pair, + u16 cm); + static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index ae8e63647663..e0c917918c70 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -81,13 +81,16 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) phydev->nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_NEST); - if (!phydev->nest) + if (!phydev->nest) { + err = -EMSGSIZE; goto out; + } return 0; out: nlmsg_free(phydev->skb); + phydev->skb = NULL; return err; } EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); @@ -95,6 +98,7 @@ EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); void ethnl_cable_test_free(struct phy_device *phydev) { nlmsg_free(phydev->skb); + phydev->skb = NULL; } EXPORT_SYMBOL_GPL(ethnl_cable_test_free); @@ -107,3 +111,50 @@ void ethnl_cable_test_finished(struct phy_device *phydev) ethnl_multicast(phydev->skb, phydev->attached_dev); } EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); + +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair)) + goto err; + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, 
result)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_result); + +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair)) + goto err; + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); -- cgit v1.2.3 From 9896a4574ecb137d4e5b9283004aa34c688bc761 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:40 +0200 Subject: net: phy: Send notifier when starting the cable test Given that it takes time to run a cable test, send a notify message at the start, as well as when it is completed. v3: EMSGSIZE when ethnl_bcastmsg_put() fails Print an error message on failure, since this is a void function. Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/ethtool/cabletest.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'net') diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index e0c917918c70..5ba06eabe8c2 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,6 +13,43 @@ cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, }; +static int ethnl_cable_test_started(struct phy_device *phydev) +{ + struct sk_buff *skb; + int err = -ENOMEM; + void *ehdr; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + ehdr = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_CABLE_TEST_NTF); + if (!ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED); + if (err) + goto out; + + genlmsg_end(skb, ehdr); + + return ethnl_multicast(skb, phydev->attached_dev); + +out: + nlmsg_free(skb); + phydev_err(phydev, "%s: Error %pe\n", __func__, ERR_PTR(err)); + + return err; +} + int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; @@ -47,6 +84,10 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) ret = phy_start_cable_test(dev->phydev, info->extack); ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev); + out_rtnl: rtnl_unlock(); out_dev_put: -- cgit v1.2.3 From 9eb8eff0cf2f1e1afc0756bb30cb9746ba90dd07 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:40 +0300 Subject: net: bridge: allow enslaving some DSA master network devices Commit 8db0a2ee2c63 ("net: bridge: reject DSA-enabled master netdevices as bridge members") added a special check in br_if.c in order to check for a DSA master network device with a tagging protocol configured. This was done because back then, such devices, once enslaved in a bridge would become inoperative and would not pass DSA tagged traffic anymore due to br_handle_frame returning RX_HANDLER_CONSUMED. 
But right now we have valid use cases which do require bridging of DSA masters. One such example is when the DSA master ports are DSA switch ports themselves (in a disjoint tree setup). This should be completely equivalent, functionally speaking, to having multiple DSA switches hanging off of the ports of a switchdev driver. So we should allow the enslaving of DSA tagged master network devices. Instead of the regular br_handle_frame(), install a new function br_handle_frame_dummy() on these DSA masters, which returns RX_HANDLER_PASS in order to call into the DSA specific tagging protocol handlers, and lift the restriction from br_add_if. Suggested-by: Nikolay Aleksandrov Suggested-by: Florian Fainelli Signed-off-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Reviewed-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- net/bridge/br_if.c | 32 +++++++++++++++++++++++--------- net/bridge/br_input.c | 23 ++++++++++++++++++++++- net/bridge/br_private.h | 6 +++--- 4 files changed, 49 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6dfc8c2f68b8..02fb5025e0ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -651,7 +651,7 @@ struct dsa_switch_driver { struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ -static inline bool netdev_uses_dsa(struct net_device *dev) +static inline bool netdev_uses_dsa(const struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ca685c0cdf95..a0e9a7937412 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -563,18 +563,32 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, unsigned br_hr, dev_hr; bool changed_addr; - /* Don't allow bridging non-ethernet like devices, or DSA-enabled - * master network devices since the bridge layer rx_handler prevents - * the DSA fake ethertype handler to be invoked, so we do not strip off - * the DSA switch tag protocol header and the bridge layer just return - * RX_HANDLER_CONSUMED, stopping RX processing for these frames. - */ + /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || - !is_valid_ether_addr(dev->dev_addr) || - netdev_uses_dsa(dev)) + !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; + /* Also don't allow bridging of net devices that are DSA masters, since + * the bridge layer rx_handler prevents the DSA fake ethertype handler + * to be invoked, so we don't get the chance to strip off and parse the + * DSA switch tag protocol header (the bridge layer just returns + * RX_HANDLER_CONSUMED, stopping RX processing for these frames). + * The only case where that would not be an issue is when bridging can + * already be offloaded, such as when the DSA master is itself a DSA + * or plain switchdev port, and is bridged only with other ports from + * the same hardware device. 
+ */ + if (netdev_uses_dsa(dev)) { + list_for_each_entry(p, &br->port_list, list) { + if (!netdev_port_same_parent_id(dev, p->dev)) { + NL_SET_ERR_MSG(extack, + "Cannot do software bridging with a DSA master"); + return -EINVAL; + } + } + } + /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, @@ -618,7 +632,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (err) goto err3; - err = netdev_rx_handler_register(dev, br_handle_frame, p); + err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index d5c34f36f0f4..59a318b9f646 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -17,6 +17,7 @@ #endif #include #include +#include #include #include #include "br_private.h" @@ -257,7 +258,7 @@ frame_finish: * Return NULL if skb is handled * note: already called with rcu_read_lock */ -rx_handler_result_t br_handle_frame(struct sk_buff **pskb) +static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; struct sk_buff *skb = *pskb; @@ -359,3 +360,23 @@ drop: } return RX_HANDLER_CONSUMED; } + +/* This function has no purpose other than to appease the br_port_get_rcu/rtnl + * helpers which identify bridged ports according to the rx_handler installed + * on them (so there _needs_ to be a bridge rx_handler even if we don't need it + * to do anything useful). This bridge won't support traffic to/from the stack, + * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal + * frames from the ETH_P_XDSA packet_type handler. + */ +static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) +{ + return RX_HANDLER_PASS; +} + +rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) +{ + if (netdev_uses_dsa(dev)) + return br_handle_frame_dummy; + + return br_handle_frame; +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4dc21e8f7e33..7501be4eeba0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -702,16 +702,16 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); -rx_handler_result_t br_handle_frame(struct sk_buff **pskb); +rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { - return rcu_dereference(dev->rx_handler) == br_handle_frame; + return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); } static inline bool br_rx_handler_check_rtnl(const struct net_device *dev) { - return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame; + return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev); } static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) -- cgit v1.2.3 From f66a6a69f97a24546664541237a82b288c2713f6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:41 +0300 Subject: net: dsa: permit cross-chip bridging between all trees in the system One way of utilizing DSA is by cascading switches which do not all have compatible taggers. 
Consider the following real-life topology:

+---------------------------------------------------------------+
|                            LS1028A                            |
|               +------------------------------+                |
|               |     DSA master for Felix     |                |
|               |(internal ENETC port 2: eno2))|                |
|  +------------+------------------------------+-------------+  |
|  |                Felix embedded L2 switch                 |  |
|  |                                                         |  |
|  | +--------------+   +--------------+   +--------------+  |  |
|  | |DSA master for|   |DSA master for|   |DSA master for|  |  |
|  | |  SJA1105 1   |   |  SJA1105 2   |   |  SJA1105 3   |  |  |
|  | |(Felix port 1)|   |(Felix port 2)|   |(Felix port 3)|  |  |
+--+-+--------------+---+--------------+---+--------------+--+--+

+-----------------------+ +-----------------------+ +-----------------------+
|   SJA1105 switch 1    | |   SJA1105 switch 2    | |   SJA1105 switch 3    |
+-----+-----+-----+-----+ +-----+-----+-----+-----+ +-----+-----+-----+-----+
|sw1p0|sw1p1|sw1p2|sw1p3| |sw2p0|sw2p1|sw2p2|sw2p3| |sw3p0|sw3p1|sw3p2|sw3p3|
+-----+-----+-----+-----+ +-----+-----+-----+-----+ +-----+-----+-----+-----+

The above can be described in the device tree as follows (obviously not complete):

mscc_felix {
	dsa,member = <0 0>;
	ports {
		port@4 {
			ethernet = <&enetc_port2>;
		};
	};
};

sja1105_switch1 {
	dsa,member = <1 1>;
	ports {
		port@4 {
			ethernet = <&mscc_felix_port1>;
		};
	};
};

sja1105_switch2 {
	dsa,member = <2 2>;
	ports {
		port@4 {
			ethernet = <&mscc_felix_port2>;
		};
	};
};

sja1105_switch3 {
	dsa,member = <3 3>;
	ports {
		port@4 {
			ethernet = <&mscc_felix_port3>;
		};
	};
};

Basically we instantiate one DSA switch tree for every hardware switch in the system, but we still give them globally unique switch IDs (will come back to that later). Having 3 disjoint switch trees makes the tagger drivers "just work", because net devices are registered for the 3 Felix DSA master ports, and they are also DSA slave ports to the ENETC port. So packets received on the ENETC port are stripped of their stacked DSA tags one by one.

Currently, hardware bridging between ports on the same sja1105 chip is possible, but switching between sja1105 ports on different chips is handled by the software bridge. This is fine, but we can do better. In fact, the dsa_8021q tag used by sja1105 is compatible with cascading. In other words, a sja1105 switch can correctly parse and route a packet containing a dsa_8021q tag. So if we could enable hardware bridging on the Felix DSA master ports, cross-chip bridging could be completely offloaded.

Such a system would be used as follows:

ip link add dev br0 type bridge && ip link set dev br0 up

for port in sw0p0 sw0p1 sw0p2 sw0p3 \
	    sw1p0 sw1p1 sw1p2 sw1p3 \
	    sw2p0 sw2p1 sw2p2 sw2p3; do
	ip link set dev $port master br0
done

The above makes switching between ports on the same row be performed in hardware, and between ports on different rows in software. Now assume the Felix switch ports are called swp0, swp1, swp2. By running the following extra commands:

ip link add dev br1 type bridge && ip link set dev br1 up

for port in swp0 swp1 swp2; do
	ip link set dev $port master br1
done

the CPU no longer sees packets which traverse sja1105 switch boundaries and can be forwarded directly by Felix. The br1 bridge would not be used for any sort of traffic termination.

For this to work, we need to give drivers an opportunity to listen for bridging events on DSA trees other than their own, and pass that other tree index as argument. I have made the assumption, for the moment, that the other existing DSA notifiers don't need to be broadcast to other trees. That assumption might turn out to be incorrect. 
But in the meantime, introduce a dsa_broadcast function, similar in purpose to dsa_port_notify, which is used only by the bridging notifiers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 16 ++++++++++++---- include/net/dsa.h | 10 ++++++---- net/dsa/dsa_priv.h | 1 + net/dsa/port.c | 23 +++++++++++++++++++++-- net/dsa/switch.c | 21 +++++++++++++++------ 5 files changed, 55 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 2b4a723c8306..7627ea61e0ea 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2233,26 +2233,34 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, mv88e6xxx_reg_unlock(chip); } -static int mv88e6xxx_crosschip_bridge_join(struct dsa_switch *ds, int dev, +static int mv88e6xxx_crosschip_bridge_join(struct dsa_switch *ds, + int tree_index, int sw_index, int port, struct net_device *br) { struct mv88e6xxx_chip *chip = ds->priv; int err; + if (tree_index != ds->dst->index) + return 0; + mv88e6xxx_reg_lock(chip); - err = mv88e6xxx_pvt_map(chip, dev, port); + err = mv88e6xxx_pvt_map(chip, sw_index, port); mv88e6xxx_reg_unlock(chip); return err; } -static void mv88e6xxx_crosschip_bridge_leave(struct dsa_switch *ds, int dev, +static void mv88e6xxx_crosschip_bridge_leave(struct dsa_switch *ds, + int tree_index, int sw_index, int port, struct net_device *br) { struct mv88e6xxx_chip *chip = ds->priv; + if (tree_index != ds->dst->index) + return; + mv88e6xxx_reg_lock(chip); - if (mv88e6xxx_pvt_map(chip, dev, port)) + if (mv88e6xxx_pvt_map(chip, sw_index, port)) dev_err(ds->dev, "failed to remap cross-chip Port VLAN\n"); mv88e6xxx_reg_unlock(chip); } diff --git a/include/net/dsa.h b/include/net/dsa.h index 02fb5025e0ac..0f4fc00239d9 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -574,10 +574,12 @@ struct dsa_switch_ops { /* * Cross-chip operations */ - int (*crosschip_bridge_join)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *br); - void (*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *br); + int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, + int sw_index, int port, + struct net_device *br); + void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, + int sw_index, int port, + struct net_device *br); /* * PTP functionality diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 6d9a1ef65fa0..a1a0ae242012 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -35,6 +35,7 @@ struct dsa_notifier_ageing_time_info { /* DSA_NOTIFIER_BRIDGE_* */ struct dsa_notifier_bridge_info { struct net_device *br; + int tree_index; int sw_index; int port; }; diff --git a/net/dsa/port.c b/net/dsa/port.c index a58fdd362574..ebc8d6cbd1d4 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -13,6 +13,23 @@ #include "dsa_priv.h" +static int dsa_broadcast(unsigned long e, void *v) +{ + struct dsa_switch_tree *dst; + int err = 0; + + list_for_each_entry(dst, &dsa_tree_list, list) { + struct raw_notifier_head *nh = &dst->nh; + + err = raw_notifier_call_chain(nh, e, v); + err = notifier_to_errno(err); + if (err) + break; + } + + return err; +} + static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) { struct raw_notifier_head *nh = &dp->ds->dst->nh; @@ -120,6 +137,7 @@ void dsa_port_disable(struct dsa_port *dp) int dsa_port_bridge_join(struct 
dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { + .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, .br = br, @@ -136,7 +154,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = br; - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); /* The bridging is rolled back on error */ if (err) { @@ -150,6 +168,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { + .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, .br = br, @@ -161,7 +180,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index f3c32ff552b3..86c8dc5c32a0 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -89,11 +89,16 @@ static int dsa_switch_mtu(struct dsa_switch *ds, static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { - if (ds->index == info->sw_index && ds->ops->port_bridge_join) + struct dsa_switch_tree *dst = ds->dst; + + if (dst->index == info->tree_index && ds->index == info->sw_index && + ds->ops->port_bridge_join) return ds->ops->port_bridge_join(ds, info->port, info->br); - if (ds->index != info->sw_index && ds->ops->crosschip_bridge_join) - return ds->ops->crosschip_bridge_join(ds, info->sw_index, + if ((dst->index != info->tree_index || ds->index != info->sw_index) && + ds->ops->crosschip_bridge_join) + return ds->ops->crosschip_bridge_join(ds, info->tree_index, + info->sw_index, info->port, info->br); return 0; @@ -103,13 +108,17 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { bool unset_vlan_filtering = br_vlan_enabled(info->br); + struct dsa_switch_tree *dst = ds->dst; int err, i; - if (ds->index == info->sw_index && ds->ops->port_bridge_leave) + if (dst->index == info->tree_index && ds->index == info->sw_index && + ds->ops->port_bridge_join) ds->ops->port_bridge_leave(ds, info->port, info->br); - if (ds->index != info->sw_index && ds->ops->crosschip_bridge_leave) - ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port, + if ((dst->index != info->tree_index || ds->index != info->sw_index) && + ds->ops->crosschip_bridge_join) + ds->ops->crosschip_bridge_leave(ds, info->tree_index, + info->sw_index, info->port, info->br); /* If the bridge was vlan_filtering, the bridge core doesn't trigger an -- cgit v1.2.3 From 3b7bc1f09101ccace330d105c13c2946bf3be6d5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:42 +0300 Subject: net: dsa: introduce a dsa_switch_find function Somewhat similar to dsa_tree_find, dsa_switch_find returns a dsa_switch structure pointer by searching for its tree index and switch index (the parameters from dsa,member). To be used, for example, by drivers who implement .crosschip_bridge_join and need a reference to the other switch indicated by the tree_index and sw_index arguments. 
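As a hedged sketch of the intended usage (not code from this patch; the example_ prefix marks hypothetical names), a driver's .crosschip_bridge_join implementation could resolve the notifier arguments into a dsa_switch like this:

/* Sketch only: resolve (tree_index, sw_index) and bail out when the
 * other switch is absent or not driven by us.
 */
static int example_crosschip_bridge_join(struct dsa_switch *ds,
					 int tree_index, int sw_index,
					 int port, struct net_device *br)
{
	struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index);

	/* The other tree may not be probed yet, or may belong to an
	 * unrelated driver.
	 */
	if (!other_ds || other_ds->ops != ds->ops)
		return 0;

	/* ...program this switch to accept traffic from other_ds... */
	return 0;
}

The sja1105 patch later in this series follows essentially this pattern.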
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 + net/dsa/dsa2.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0f4fc00239d9..312c2f067e65 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -672,6 +672,7 @@ static inline bool dsa_can_decode(const struct sk_buff *skb, void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); +struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d90665b465b8..076908fdd29b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -24,6 +24,27 @@ LIST_HEAD(dsa_tree_list); static const struct devlink_ops dsa_devlink_ops = { }; +struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) +{ + struct dsa_switch_tree *dst; + struct dsa_port *dp; + + list_for_each_entry(dst, &dsa_tree_list, list) { + if (dst->index != tree_index) + continue; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->index != sw_index) + continue; + + return dp->ds; + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_switch_find); + static struct dsa_switch_tree *dsa_tree_find(int index) { struct dsa_switch_tree *dst; -- cgit v1.2.3 From ac02a451a6148bb9c395b39783ce7299eddf4f31 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 10 May 2020 19:37:43 +0300 Subject: net: dsa: sja1105: implement cross-chip bridging operations sja1105 uses dsa_8021q for DSA tagging, a format which is VLAN at heart and which is compatible with cascading. A complete description of this tagging format is in net/dsa/tag_8021q.c, but a quick summary is that each external-facing port tags incoming frames with a unique pvid, and this special VLAN is transmitted as tagged towards the inside of the system, and as untagged towards the exterior. The tag encodes the switch id and the source port index. This means that cross-chip bridging for dsa_8021q only entails adding the dsa_8021q pvids of one switch to the RX filter of the other switches. Everything else falls naturally into place, as long as the bottom-end of ports (the leaves in the tree) is comprised exclusively of dsa_8021q-compatible (i.e. sja1105 switches). Otherwise, there would be a chance that a front-panel switch transmits a packet tagged with a dsa_8021q header, header which it wouldn't be able to remove, and which would hence "leak" out. The only use case I tested (due to lack of board availability) was when the sja1105 switches are part of disjoint trees (however, this doesn't change the fact that multiple sja1105 switches still need unique switch identifiers in such a system). But in principle, even "true" single-tree setups (with DSA links) should work just as fine, except for a small change which I can't test: dsa_towards_port should be used instead of dsa_upstream_port (I made the assumption that the routing port that any sja1105 should use towards its neighbours is the CPU port. That might not hold true in other setups). 
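To make the "unique pvid per source port" idea concrete, a deliberately simplified sketch of such an encoding follows; the real bit layout lives in net/dsa/tag_8021q.c and differs in detail, so treat this as an assumption-laden illustration, not the actual format:

/* Simplified assumption, not the real tag_8021q layout: carve a VID
 * out of a reserved range per (switch, port) pair, so that any switch
 * in the cascade can tell from the VLAN tag where a frame entered the
 * system.
 */
static inline u16 example_8021q_rx_vid(int switch_id, int port)
{
	return 1024 | (switch_id << 4) | port;
}

With a scheme like this, offloading cross-chip forwarding reduces to installing switch A's RX VIDs into switch B's VLAN tables, which is what the crosschip link helpers added below do.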
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105.h | 2 + drivers/net/dsa/sja1105/sja1105_main.c | 90 ++++++++++++++++++++ include/linux/dsa/8021q.h | 45 ++++++++++ net/dsa/tag_8021q.c | 151 +++++++++++++++++++++++++++++++++ 4 files changed, 288 insertions(+) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 8df2a5c53b02..a64ace07b89f 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "sja1105_static_config.h" @@ -185,6 +186,7 @@ struct sja1105_private { struct gpio_desc *reset_gpio; struct spi_device *spidev; struct dsa_switch *ds; + struct list_head crosschip_links; struct sja1105_flow_block flow_block; struct sja1105_port ports[SJA1105_NUM_PORTS]; /* Serializes transmission of management frames so that diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 666e54565df0..d5de9305df25 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -25,6 +25,8 @@ #include "sja1105_sgmii.h" #include "sja1105_tas.h" +static const struct dsa_switch_ops sja1105_switch_ops; + static void sja1105_hw_reset(struct gpio_desc *gpio, unsigned int pulse_len, unsigned int startup_delay) { @@ -1791,6 +1793,84 @@ static int sja1105_vlan_apply(struct sja1105_private *priv, int port, u16 vid, return 0; } +static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, + int tree_index, int sw_index, + int other_port, struct net_device *br) +{ + struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index); + struct sja1105_private *other_priv = other_ds->priv; + struct sja1105_private *priv = ds->priv; + int port, rc; + + if (other_ds->ops != &sja1105_switch_ops) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_user_port(ds, port)) + continue; + if (dsa_to_port(ds, port)->bridge_dev != br) + continue; + + rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds, + other_port, br, + &priv->crosschip_links); + if (rc) + return rc; + + rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port, ds, + port, br, + &other_priv->crosschip_links); + if (rc) + return rc; + } + + return 0; +} + +static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds, + int tree_index, int sw_index, + int other_port, + struct net_device *br) +{ + struct dsa_switch *other_ds = dsa_switch_find(tree_index, sw_index); + struct sja1105_private *other_priv = other_ds->priv; + struct sja1105_private *priv = ds->priv; + int port; + + if (other_ds->ops != &sja1105_switch_ops) + return; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_user_port(ds, port)) + continue; + if (dsa_to_port(ds, port)->bridge_dev != br) + continue; + + dsa_8021q_crosschip_bridge_leave(ds, port, other_ds, other_port, + br, &priv->crosschip_links); + + dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, + port, br, + &other_priv->crosschip_links); + } +} + +static int sja1105_replay_crosschip_vlans(struct dsa_switch *ds, bool enabled) +{ + struct sja1105_private *priv = ds->priv; + struct dsa_8021q_crosschip_link *c; + int rc; + + list_for_each_entry(c, &priv->crosschip_links, list) { + rc = dsa_8021q_crosschip_link_apply(ds, c->port, c->other_ds, + c->other_port, enabled); + if (rc) + break; + } + + return rc; +} + static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, 
bool enabled) { int rc, i; @@ -1803,6 +1883,12 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) return rc; } } + rc = sja1105_replay_crosschip_vlans(ds, enabled); + if (rc) { + dev_err(ds->dev, "Failed to replay crosschip VLANs: %d\n", rc); + return rc; + } + dev_info(ds->dev, "%s switch tagging\n", enabled ? "Enabled" : "Disabled"); return 0; @@ -2370,6 +2456,8 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .cls_flower_add = sja1105_cls_flower_add, .cls_flower_del = sja1105_cls_flower_del, .cls_flower_stats = sja1105_cls_flower_stats, + .crosschip_bridge_join = sja1105_crosschip_bridge_join, + .crosschip_bridge_leave = sja1105_crosschip_bridge_leave, }; static int sja1105_check_device_id(struct sja1105_private *priv) @@ -2472,6 +2560,8 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->ptp_data.lock); mutex_init(&priv->mgmt_lock); + INIT_LIST_HEAD(&priv->crosschip_links); + sja1105_tas_setup(ds); sja1105_flower_setup(ds); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index c620d9139c28..b8daaec0896e 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -12,11 +12,33 @@ struct sk_buff; struct net_device; struct packet_type; +struct dsa_8021q_crosschip_link { + struct list_head list; + int port; + struct dsa_switch *other_ds; + int other_port; + refcount_t refcount; +}; + #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, bool enabled); +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled); + +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); @@ -36,6 +58,29 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, return 0; } +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + return 0; +} + +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + return 0; +} + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + return 0; +} + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index b97ad93d1c1a..ff9c5bf64bda 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -8,6 +8,7 @@ */ #include #include +#include #include "dsa_priv.h" @@ -288,6 +289,156 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); +int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + /* @rx_vid of local @ds port @port goes to @other_port of + * @other_ds + */ + return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, 
+ BRIDGE_VLAN_INFO_UNTAGGED, enabled); +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_link_apply); + +static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, + struct list_head *crosschip_links) +{ + struct dsa_8021q_crosschip_link *c; + + list_for_each_entry(c, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + c->other_port == other_port) { + refcount_inc(&c->refcount); + return 0; + } + } + + dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n", + port, dev_name(other_ds->dev), other_port); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->port = port; + c->other_ds = other_ds; + c->other_port = other_port; + refcount_set(&c->refcount, 1); + + list_add(&c->list, crosschip_links); + + return 0; +} + +static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, + struct dsa_8021q_crosschip_link *c, + struct list_head *crosschip_links, + bool *keep) +{ + *keep = !refcount_dec_and_test(&c->refcount); + + if (*keep) + return; + + dev_dbg(ds->dev, + "deleting crosschip link from port %d to %s port %d\n", + c->port, dev_name(c->other_ds->dev), c->other_port); + + list_del(&c->list); + kfree(c); +} + +/* Make traffic from local port @port be received by remote port @other_port. + * This means that our @rx_vid needs to be installed on @other_ds's upstream + * and user ports. The user ports should be egress-untagged so that they can + * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged + * or untagged: it doesn't matter, since it should never egress a frame having + * our @rx_vid. + */ +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + /* @other_upstream is how @other_ds reaches us. If we are part + * of disjoint trees, then we are probably connected through + * our CPU ports. If we're part of the same tree though, we should + * probably use dsa_towards_port. 
+ */ + int other_upstream = dsa_upstream_port(other_ds, other_port); + int rc; + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_port, crosschip_links); + if (rc) + return rc; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; + } + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_upstream, + crosschip_links); + if (rc) + return rc; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); + if (rc) + return rc; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, struct net_device *br, + struct list_head *crosschip_links) +{ + int other_upstream = dsa_upstream_port(other_ds, other_port); + struct dsa_8021q_crosschip_link *c, *n; + + list_for_each_entry_safe(c, n, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + (c->other_port == other_port || + c->other_port == other_upstream)) { + struct dsa_switch *other_ds = c->other_ds; + int other_port = c->other_port; + bool keep; + int rc; + + dsa_8021q_crosschip_link_del(ds, c, crosschip_links, + &keep); + if (keep) + continue; + + if (!br_vlan_enabled(br)) { + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; + } + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { -- cgit v1.2.3 From eaa7b7228ff7f8688623120bb0cd75d1490d5d04 Mon Sep 17 00:00:00 2001 From: Tedd Ho-Jeong An Date: Fri, 1 May 2020 10:00:50 -0700 Subject: Bluetooth: Fix advertising handle is set to 0 This patch fixes the advertising handle being set to 0 regardless of the actual instance value. The affected commands are LE Set Advertising Set Random Address, LE Set Extended Advertising Data, and LE Set Extended Scan Response Data commands. 
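For illustration only, the corrected pattern shared by all three command paths is sketched below; struct example_cp stands in for the real HCI command parameter structures, each of which carries an advertising handle field:

/* Sketch, not patch code: per-instance commands must address the
 * advertising set they operate on instead of always targeting set 0.
 */
struct example_cp {
	__u8 handle;
	/* ...command-specific fields... */
};

static void example_fill_per_instance_cmd(struct example_cp *cp,
					  __u8 instance)
{
	memset(cp, 0, sizeof(*cp));
	cp->handle = instance;	/* was hard-coded to 0 before this fix */
}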
Signed-off-by: Tedd Ho-Jeong An Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 9ea40106ef17..3f470f0e432c 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1447,7 +1447,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); hdev->scan_rsp_data_len = len; - cp.handle = 0; + cp.handle = instance; cp.length = len; cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; @@ -1591,7 +1591,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) hdev->adv_data_len = len; cp.length = len; - cp.handle = 0; + cp.handle = instance; cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; @@ -1876,7 +1876,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - cp.handle = 0; + cp.handle = instance; bacpy(&cp.bdaddr, &random_addr); hci_req_add(req, -- cgit v1.2.3 From 69d67b461a180144ad1d31174fadf3e3eda78e56 Mon Sep 17 00:00:00 2001 From: Konstantin Forostyan Date: Mon, 4 May 2020 16:01:49 +0000 Subject: Bluetooth: L2CAP: Fix errors during L2CAP_CREDIT_BASED_CONNECTION_REQ (0x17) Fix two typos in the L2CAP_CREDIT_BASED_CONNECTION_REQ (0x17) handling function that cause BlueZ to answer with an L2CAP_CR_LE_INVALID_PARAMS or L2CAP_CR_LE_INVALID_SCID error on a correct ECRED connection request. Enhanced Credit Based Mode support was recently introduced with the commit 15f02b91056253e8cdc592888f431da0731337b8 ("Bluetooth: L2CAP: Add initial code for Enhanced Credit Based Mode"). Signed-off-by: Konstantin Forostyan Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fd9d0d08f9c9..fe913a5c754a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5927,7 +5927,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, if (!enable_ecred) return -EINVAL; - if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) { + if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } @@ -5964,7 +5964,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, } result = L2CAP_CR_LE_SUCCESS; - cmd_len -= sizeof(req); + cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); for (i = 0; i < num_scid; i++) { -- cgit v1.2.3 From 5f4b91728bba007be563fa5a3bd5d96b6a03b3b9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:46 +0200 Subject: Bluetooth: Add MGMT_EV_PHY_CONFIGURATION_CHANGED to supported list The event MGMT_EV_PHY_CONFIGURATION_CHANGED wasn't listed in the list of supported events. So add it. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f8c0a4fc8090..33b5640ea060 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -147,6 +147,7 @@ static const u16 mgmt_events[] = { MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, MGMT_EV_EXT_INFO_CHANGED, + MGMT_EV_PHY_CONFIGURATION_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { -- cgit v1.2.3 From 181d695352305cc52a49c151a1c3370376e54887 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:47 +0200 Subject: Bluetooth: Replace BT_DBG with bt_dev_dbg for management support The majority of management interaction are based on a controller index and have a hci_dev associated with it. So use bt_dev_dbg to have a clean way of indentifying the controller the debug message belongs to. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 221 +++++++++++++++++++++++++-------------------------- 1 file changed, 110 insertions(+), 111 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 33b5640ea060..78cf72b64014 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -294,7 +294,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_rp_read_version rp; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); mgmt_fill_version_info(&rp); @@ -310,7 +310,7 @@ static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data, size_t rp_size; int i, err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) { num_commands = ARRAY_SIZE(mgmt_commands); @@ -363,7 +363,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -397,7 +397,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } } @@ -423,7 +423,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -457,7 +457,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } } @@ -482,7 +482,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -524,7 +524,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, rp->entry[count].bus = d->bus; rp->entry[count++].index = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } rp->num_controllers = cpu_to_le16(count); @@ -600,7 +600,7 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_rp_read_config_info rp; u32 options = 0; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -940,7 +940,7 @@ static void 
rpa_expired(struct work_struct *work) rpa_expired.work); struct hci_request req; - BT_DBG(""); + bt_dev_dbg(hdev, ""); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); @@ -980,7 +980,7 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_read_info rp; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -1036,7 +1036,7 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_rp_read_ext_info *rp = (void *)buf; u16 eir_len; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&buf, 0, sizeof(buf)); @@ -1095,7 +1095,7 @@ static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("%s status 0x%02x", hdev->name, status); + bt_dev_dbg(hdev, "status 0x%02x", status); if (hci_conn_count(hdev) == 0) { cancel_delayed_work(&hdev->power_off); @@ -1171,7 +1171,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, @@ -1312,7 +1312,7 @@ void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -1351,7 +1351,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, u16 timeout; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) @@ -1477,7 +1477,7 @@ void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -1537,7 +1537,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) @@ -1594,7 +1594,7 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE, @@ -1638,7 +1638,7 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, u8 val, status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1706,7 +1706,7 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) u8 status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1787,7 +1787,7 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) u8 status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1893,7 +1893,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) int err; u8 val, enabled; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, 
"sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, @@ -2054,7 +2054,7 @@ unlock: static void add_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_ADD_UUID, status); } @@ -2067,7 +2067,7 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct bt_uuid *uuid; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2133,7 +2133,7 @@ static bool enable_service_cache(struct hci_dev *hdev) static void remove_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_REMOVE_UUID, status); } @@ -2148,7 +2148,7 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err, found; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2219,7 +2219,7 @@ unlock: static void set_class_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_SET_DEV_CLASS, status); } @@ -2232,7 +2232,7 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, @@ -2305,7 +2305,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; int i; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, @@ -2331,8 +2331,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); - BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys, - key_count); + bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys, + key_count); for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; @@ -2533,7 +2533,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -2617,7 +2617,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, int err; u16 i; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2693,7 +2693,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2751,7 +2751,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_set_io_capability *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, @@ -2761,8 +2761,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, hdev->io_capability = cp->io_capability; - BT_DBG("%s IO capability set to 0x%02x", hdev->name, - hdev->io_capability); + bt_dev_dbg(hdev, "IO 
capability set to 0x%02x", hdev->io_capability); hci_dev_unlock(hdev); @@ -2874,7 +2873,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -3003,7 +3002,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3114,7 +3113,7 @@ static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_pin_code_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_PIN_CODE_NEG_REPLY, @@ -3126,7 +3125,7 @@ static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_user_confirm_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (len != sizeof(*cp)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY, @@ -3142,7 +3141,7 @@ static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_user_confirm_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_CONFIRM_NEG_REPLY, @@ -3154,7 +3153,7 @@ static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_user_passkey_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_REPLY, @@ -3166,7 +3165,7 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_user_passkey_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_NEG_REPLY, @@ -3207,7 +3206,7 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_cp_set_local_name *cp; struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -3242,7 +3241,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3311,7 +3310,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, u16 appearance; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, @@ -3343,7 +3342,7 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_get_phy_confguration rp; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3376,7 +3375,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status, { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -3414,7 +3413,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, bool changed = false; int err; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); configurable_phys = get_configurable_phys(hdev); supported_phys = get_supported_phys(hdev); @@ -3567,7 +3566,7 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 key_count, expected_len; int i; - BT_DBG("request for %s", hdev->name); + 
bt_dev_dbg(hdev, "sock %p", sk); key_count = __le16_to_cpu(keys->key_count); if (key_count > max_key_count) { @@ -3613,7 +3612,7 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, int err; bool changed = false; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) return mgmt_cmd_status(sk, hdev->id, @@ -3718,7 +3717,7 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, size_t rp_size = sizeof(mgmt_rp); struct mgmt_pending_cmd *cmd; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev); if (!cmd) @@ -3777,7 +3776,7 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3827,7 +3826,7 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, struct mgmt_addr_info *addr = data; int err; - BT_DBG("%s ", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(addr->type)) return mgmt_cmd_complete(sk, hdev->id, @@ -3936,7 +3935,7 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->addr.type != BDADDR_BREDR) return mgmt_cmd_complete(sk, hdev->id, @@ -3970,7 +3969,7 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -4031,7 +4030,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4123,7 +4122,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4218,7 +4217,7 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -4244,7 +4243,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4286,7 +4285,7 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, struct inquiry_entry *e; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4328,7 +4327,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, @@ -4364,7 +4363,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, @@ -4401,7 +4400,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, int err; __u16 source; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); source = __le16_to_cpu(cp->source); @@ -4431,7 +4430,7 @@ static int 
set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, static void enable_advertising_instance(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); } static void set_advertising_complete(struct hci_dev *hdev, u8 status, @@ -4517,7 +4516,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, u8 val, status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_le_support(hdev); if (status) @@ -4626,7 +4625,7 @@ static int set_static_address(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_static_address *cp = data; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, @@ -4671,7 +4670,7 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, __u16 interval, window; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, @@ -4726,7 +4725,7 @@ static void fast_connectable_complete(struct hci_dev *hdev, u8 status, { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -4763,7 +4762,7 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) || hdev->hci_ver < BLUETOOTH_VER_1_2) @@ -4824,7 +4823,7 @@ static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -4859,7 +4858,7 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct hci_request req; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, @@ -4969,7 +4968,7 @@ static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5018,7 +5017,7 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, u8 val; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_sc_capable(hdev) && !hci_dev_test_flag(hdev, HCI_LE_ENABLED)) @@ -5104,7 +5103,7 @@ static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, bool changed, use_changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, @@ -5151,7 +5150,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, bool changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, @@ -5226,7 +5225,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 irk_count, expected_len; int i, err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, 
MGMT_OP_LOAD_IRKS, @@ -5248,7 +5247,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s irk_count %u", hdev->name, irk_count); + bt_dev_dbg(hdev, "irk_count %u", irk_count); for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *key = &cp->irks[i]; @@ -5316,7 +5315,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, u16 key_count, expected_len; int i, err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, @@ -5338,7 +5337,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s key_count %u", hdev->name, key_count); + bt_dev_dbg(hdev, "key_count %u", key_count); for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; @@ -5439,7 +5438,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, u16 handle; u8 status; - BT_DBG("status 0x%02x", hci_status); + bt_dev_dbg(hdev, "status 0x%02x", hci_status); hci_dev_lock(hdev); @@ -5493,7 +5492,7 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, unsigned long conn_info_age; int err = 0; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -5647,7 +5646,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_pending_cmd *cmd; struct hci_conn *conn; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5684,7 +5683,7 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -5805,8 +5804,8 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, params->auto_connect = auto_connect; - BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, - auto_connect); + bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", + addr, addr_type, auto_connect); return 0; } @@ -5830,7 +5829,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, u8 auto_conn, addr_type; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type) || !bacmp(&cp->addr.bdaddr, BDADDR_ANY)) @@ -5928,7 +5927,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_remove_device *cp = data; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -6037,7 +6036,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, kfree(p); } - BT_DBG("All LE connection parameters were removed"); + bt_dev_dbg(hdev, "All LE connection parameters were removed"); hci_update_background_scan(hdev); } @@ -6080,7 +6079,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s param_count %u", hdev->name, param_count); + bt_dev_dbg(hdev, "param_count %u", param_count); hci_dev_lock(hdev); @@ -6092,8 +6091,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, u16 min, max, latency, timeout; u8 addr_type; - BT_DBG("Adding %pMR (type %u)", &param->addr.bdaddr, - param->addr.type); + bt_dev_dbg(hdev, "Adding %pMR (type %u)", &param->addr.bdaddr, +
param->addr.type); if (param->addr.type == BDADDR_LE_PUBLIC) { addr_type = ADDR_LE_DEV_PUBLIC; @@ -6109,8 +6108,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, latency = le16_to_cpu(param->latency); timeout = le16_to_cpu(param->timeout); - BT_DBG("min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", - min, max, latency, timeout); + bt_dev_dbg(hdev, "min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", + min, max, latency, timeout); if (hci_check_conn_params(min, max, latency, timeout) < 0) { bt_dev_err(hdev, "ignoring invalid connection parameters"); @@ -6143,7 +6142,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, bool changed; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, @@ -6199,7 +6198,7 @@ static int set_public_address(struct sock *sk, struct hci_dev *hdev, bool changed; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, @@ -6254,7 +6253,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, u16 eir_len; int err; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev); if (!cmd) @@ -6393,7 +6392,7 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, u8 status, flags, role, addr[7], hash[16], rand[16]; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) { switch (cp->type) { @@ -6580,7 +6579,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, u32 supported_flags; u8 *instance; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, @@ -6723,7 +6722,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, struct adv_info *adv_instance, *n; u8 instance; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -6782,7 +6781,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, struct mgmt_pending_cmd *cmd; struct hci_request req; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_le_support(hdev); if (status) @@ -6919,7 +6918,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -6951,7 +6950,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -7023,7 +7022,7 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, u32 flags, supported_flags; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, @@ -7251,7 +7250,7 @@ void mgmt_power_on(struct hci_dev *hdev, int err) { struct cmd_lookup match = { NULL, hdev }; - BT_DBG("err %d", err); + bt_dev_dbg(hdev, "err %d", err); hci_dev_lock(hdev); @@ -7670,7 +7669,7 @@ int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_user_confirm_request ev; - BT_DBG("%s", hdev->name); + 
bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -7686,7 +7685,7 @@ int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_user_passkey_request ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -7747,7 +7746,7 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_passkey_notify ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -8166,7 +8165,7 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering) { struct mgmt_ev_discovering ev; - BT_DBG("%s discovering %u", hdev->name, discovering); + bt_dev_dbg(hdev, "discovering %u", discovering); memset(&ev, 0, sizeof(ev)); ev.type = hdev->discovery.type; -- cgit v1.2.3 From d5cc6626b33780699c7a4986f3521361306862fe Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:49 +0200 Subject: Bluetooth: Introduce HCI_MGMT_HDEV_OPTIONAL option When setting HCI_MGMT_HDEV_OPTIONAL it is possible to target a specific conntroller or a global interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_sock.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 239ab72f16c6..0c7f3ad76665 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1554,6 +1554,7 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event); #define HCI_MGMT_NO_HDEV BIT(1) #define HCI_MGMT_UNTRUSTED BIT(2) #define HCI_MGMT_UNCONFIGURED BIT(3) +#define HCI_MGMT_HDEV_OPTIONAL BIT(4) struct hci_mgmt_handler { int (*func) (struct sock *sk, struct hci_dev *hdev, void *data, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 9c4a093f8960..caf38a8ea6a8 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1579,11 +1579,13 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, } } - no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); - if (no_hdev != !hdev) { - err = mgmt_cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) { + no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); + if (no_hdev != !hdev) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } } var_len = (handler->flags & HCI_MGMT_VAR_LEN); -- cgit v1.2.3 From 568602457c1ab6d26db828de168e4ef35b88f1bc Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:50 +0200 Subject: Bluetooth: Replace BT_DBG with bt_dev_dbg for security manager support The security manager operates on a specific controller and thus use bt_dev_dbg to indetify the controller for each debug message. 
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index df22cbf94693..5510017cf9ff 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -508,7 +508,7 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], if (!chan || !chan->data) return false; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); + bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); if (err) @@ -534,7 +534,7 @@ int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) if (err < 0) return err; - BT_DBG("RPA %pMR", rpa); + bt_dev_dbg(hdev, "RPA %pMR", rpa); return 0; } @@ -551,7 +551,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) smp = chan->data; if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { - BT_DBG("Using debug keys"); + bt_dev_dbg(hdev, "Using debug keys"); err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); if (err) return err; @@ -1867,7 +1867,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) { struct hci_dev *hdev = smp->conn->hcon->hdev; - BT_DBG(""); + bt_dev_dbg(hdev, ""); if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *chan = hdev->smp_data; -- cgit v1.2.3 From a10c907ce0e5e138c3da091fcb7c3d109a15aec5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:51 +0200 Subject: Bluetooth: Add support for experimental features configuration To enable platform specific experimental features, introduce this new set of management commands and events. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/mgmt.h | 27 +++++++++++++++++++++ net/bluetooth/mgmt.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index e5bc1dfe809a..16ab6ce87883 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -258,6 +258,7 @@ enum { HCI_MGMT_DEV_CLASS_EVENTS, HCI_MGMT_LOCAL_NAME_EVENTS, HCI_MGMT_OOB_DATA_EVENTS, + HCI_MGMT_EXP_FEATURE_EVENTS, }; /* diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 9d4d87c6028e..16e0d87bd8fa 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -681,6 +681,27 @@ struct mgmt_rp_read_security_info { __u8 sec[]; } __packed; +#define MGMT_OP_READ_EXP_FEATURES_INFO 0x0049 +#define MGMT_READ_EXP_FEATURES_INFO_SIZE 0 +struct mgmt_rp_read_exp_features_info { + __le16 feature_count; + struct { + __u8 uuid[16]; + __le32 flags; + } features[]; +} __packed; + +#define MGMT_OP_SET_EXP_FEATURE 0x004a +struct mgmt_cp_set_exp_feature { + __u8 uuid[16]; + __u8 param[]; +} __packed; +#define MGMT_SET_EXP_FEATURE_SIZE 16 +struct mgmt_rp_set_exp_feature { + __u8 uuid[16]; + __le32 flags; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; @@ -906,3 +927,9 @@ struct mgmt_ev_ext_info_changed { struct mgmt_ev_phy_configuration_changed { __le32 selected_phys; } __packed; + +#define MGMT_EV_EXP_FEATURE_CHANGED 0x0027 +struct mgmt_ev_exp_feature_changed { + __u8 uuid[16]; + __le32 flags; +} __packed; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 78cf72b64014..3c6be70d98ef 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -109,6 +109,8 @@ static const u16 mgmt_commands[] = { 
MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_EXP_FEATURES_INFO, + MGMT_OP_SET_EXP_FEATURE, }; static const u16 mgmt_events[] = { @@ -148,6 +150,7 @@ static const u16 mgmt_events[] = { MGMT_EV_ADVERTISING_REMOVED, MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_PHY_CONFIGURATION_CHANGED, + MGMT_EV_EXP_FEATURE_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -158,6 +161,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_EXP_FEATURES_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -172,6 +176,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, MGMT_EV_EXT_INFO_CHANGED, + MGMT_EV_EXP_FEATURE_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -3710,6 +3715,53 @@ static int read_security_info(struct sock *sk, struct hci_dev *hdev, rp, sizeof(*rp) + sec_len); } +static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[42]; + struct mgmt_rp_read_exp_features_info *rp = (void *)buf; + u16 idx = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + + rp->feature_count = cpu_to_le16(idx); + + /* After reading the experimental features information, enable + * the events to update client on any future change. + */ + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_READ_EXP_FEATURES_INFO, + 0, rp, sizeof(*rp) + (20 * idx)); +} + +static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_set_exp_feature *cp = data; + struct mgmt_rp_set_exp_feature rp; + + bt_dev_dbg(hdev, "sock %p", sk); + + if (!memcmp(cp->uuid, ZERO_KEY, 16)) { + memset(rp.uuid, 0, 16); + rp.flags = cpu_to_le32(0); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + } + + return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -7152,6 +7204,12 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_wideband_speech, MGMT_SETTING_SIZE }, { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, HCI_MGMT_UNTRUSTED }, + { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE, + HCI_MGMT_UNTRUSTED | + HCI_MGMT_HDEV_OPTIONAL }, + { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE, + HCI_MGMT_VAR_LEN | + HCI_MGMT_HDEV_OPTIONAL }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From e625e50ceee18bc1e3fb1a6375e089405a797a4d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 6 May 2020 09:57:52 +0200 Subject: Bluetooth: Introduce debug feature when dynamic debug is disabled In case dynamic debug is disabled, this feature allows a vendor platform to provide debug statement printing. 
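For illustration, a client enables such a feature by sending MGMT_OP_SET_EXP_FEATURE with the 16-byte feature UUID followed by a single parameter octet, matching struct mgmt_cp_set_exp_feature from the previous patch. The sketch below is hypothetical userspace code; the UUID bytes are the bytewise little-endian form of the debug UUID d4992530-b9ec-469f-ab01-6c481c47da1c introduced further down in this patch:

	#include <stdint.h>
	#include <string.h>

	static const uint8_t debug_uuid[16] = {
		0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab,
		0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4,
	};

	/* Fill buf with the MGMT_OP_SET_EXP_FEATURE payload and return
	 * its length: MGMT_SET_EXP_FEATURE_SIZE (16) plus the single
	 * parameter octet the handler accepts (0x00 or 0x01).
	 */
	static size_t build_set_exp_feature(uint8_t buf[17], int enable)
	{
		memcpy(buf, debug_uuid, 16);	/* cp->uuid */
		buf[16] = enable ? 0x01 : 0x00;	/* cp->param[0] */
		return 17;
	}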
Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 11 +++++ net/bluetooth/Kconfig | 7 ++++ net/bluetooth/lib.c | 33 +++++++++++++++ net/bluetooth/mgmt.c | 87 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 3fa7b1e3c5d9..18190055374c 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -153,6 +153,12 @@ __printf(1, 2) void bt_warn(const char *fmt, ...); __printf(1, 2) void bt_err(const char *fmt, ...); +#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) +void bt_dbg_set(bool enable); +bool bt_dbg_get(void); +__printf(1, 2) +void bt_dbg(const char *fmt, ...); +#endif __printf(1, 2) void bt_warn_ratelimited(const char *fmt, ...); __printf(1, 2) @@ -161,7 +167,12 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) #define BT_WARN(fmt, ...) bt_warn(fmt "\n", ##__VA_ARGS__) #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) + +#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) +#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) +#else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) +#endif #define bt_dev_info(hdev, fmt, ...) \ BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 9e25c6570170..1d6d243cdde9 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -135,4 +135,11 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. +config BT_FEATURE_DEBUG + bool "Enable runtime option for debugging statements" + depends on BT && !DYNAMIC_DEBUG + help + This provides an option to enable/disable debugging statements + at runtime via the experimental features interface. + source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index c09e0a3a0ed9..5326f41a58b7 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -183,6 +183,39 @@ void bt_err(const char *format, ...) } EXPORT_SYMBOL(bt_err); +#ifdef CONFIG_BT_FEATURE_DEBUG +static bool debug_enable; + +void bt_dbg_set(bool enable) +{ + debug_enable = enable; +} + +bool bt_dbg_get(void) +{ + return debug_enable; +} + +void bt_dbg(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + if (likely(!debug_enable)) + return; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + printk(KERN_DEBUG pr_fmt("%pV"), &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_dbg); +#endif + void bt_warn_ratelimited(const char *format, ...) { struct va_format vaf; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3c6be70d98ef..9e8a3cccc6ca 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3715,6 +3715,14 @@ static int read_security_info(struct sock *sk, struct hci_dev *hdev, rp, sizeof(*rp) + sec_len); } +#ifdef CONFIG_BT_FEATURE_DEBUG +/* d4992530-b9ec-469f-ab01-6c481c47da1c */ +static const u8 debug_uuid[16] = { + 0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab, + 0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4, +}; +#endif + static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -3726,6 +3734,16 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, memset(&buf, 0, sizeof(buf)); +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!hdev) { + u32 flags = bt_dbg_get() ? 
BIT(0) : 0; + + memcpy(rp->features[idx].uuid, debug_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } +#endif + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -3738,6 +3756,21 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, 0, rp, sizeof(*rp) + (20 * idx)); } +#ifdef CONFIG_BT_FEATURE_DEBUG +static int exp_debug_feature_changed(bool enabled, struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, debug_uuid, 16); + ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} +#endif + static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -3750,6 +3783,17 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, memset(rp.uuid, 0, 16); rp.flags = cpu_to_le32(0); +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!hdev) { + bool changed = bt_dbg_get(); + + bt_dbg_set(false); + + if (changed) + exp_debug_feature_changed(false, sk); + } +#endif + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, @@ -3757,6 +3801,49 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!memcmp(cp->uuid, debug_uuid, 16)) { + bool val, changed; + int err; + + /* Command requires to use the non-controller index */ + if (hdev) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + changed = val ? !bt_dbg_get() : bt_dbg_get(); + bt_dbg_set(val); + + memcpy(rp.uuid, debug_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_debug_feature_changed(val, sk); + + return err; + } +#endif + return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); -- cgit v1.2.3 From 7d4343d501f9b5ddbc92f278adba339d16d010e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 May 2020 10:33:42 +0200 Subject: xfrm: fix unused variable warning if CONFIG_NETFILTER=n After recent change 'x' is only used when CONFIG_NETFILTER is set: net/ipv4/xfrm4_output.c: In function '__xfrm4_output': net/ipv4/xfrm4_output.c:19:21: warning: unused variable 'x' [-Wunused-variable] 19 | struct xfrm_state *x = skb_dst(skb)->xfrm; Expand the CONFIG_NETFILTER scope to avoid this. 
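The fix pattern, shown as a minimal sketch (the function body is abridged; only the #ifdef placement matters): when a local variable's only reader is compiled conditionally, declare it under the same #ifdef so both configurations build without warnings.

	static int example_output(struct net *net, struct sock *sk,
				  struct sk_buff *skb)
	{
	#ifdef CONFIG_NETFILTER
		/* Declared inside the #ifdef: with CONFIG_NETFILTER=n
		 * the variable no longer exists, so -Wunused-variable
		 * stays quiet.
		 */
		struct xfrm_state *x = skb_dst(skb)->xfrm;

		if (!x) {
			IPCB(skb)->flags |= IPSKB_REROUTED;
			return dst_output(net, sk, skb);
		}
	#endif
		return 0;	/* rest of the output path elided */
	}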
Fixes: 2ab6096db2f1 ("xfrm: remove output_finish indirection from xfrm_state_afinfo") Reported-by: Stephen Rothwell Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 502eb189d852..3cff51ba72bb 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -16,9 +16,9 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_NETFILTER struct xfrm_state *x = skb_dst(skb)->xfrm; -#ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); -- cgit v1.2.3 From 2fa3888bb7a9fb3966c77555c8cd6f4bd6a1439a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 11 May 2020 16:47:14 -0700 Subject: net: dsa: ocelot: Constify dsa_device_ops ocelot_netdev_ops should be const since that is what the DSA layer expects. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 59de1315100f..b0c98ee4e13b 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -228,7 +228,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, return skb; } -static struct dsa_device_ops ocelot_netdev_ops = { +static const struct dsa_device_ops ocelot_netdev_ops = { .name = "ocelot", .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, -- cgit v1.2.3 From 097f024454fca0e13bbba0ab54dfe63ac5610953 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 11 May 2020 16:47:15 -0700 Subject: net: dsa: tag_sja1105: Constify dsa_device_ops sja1105_netdev_ops should be const since that is what the DSA layer expects. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index d553bf36bd41..5ecac5921a7d 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -304,7 +304,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, is_meta); } -static struct dsa_device_ops sja1105_netdev_ops = { +static const struct dsa_device_ops sja1105_netdev_ops = { .name = "sja1105", .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, -- cgit v1.2.3 From 0462b6bdb6445b887b8896f28be92e0d94c92e7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:11 +0200 Subject: net: add a CMSG_USER_DATA macro Add a variant of CMSG_DATA that operates on user pointer to avoid sparse warnings about casting to/from user pointers. Also fix up CMSG_DATA to rely on the gcc extension that allows void pointer arithmetics to cut down on the amount of casts. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/socket.h | 5 ++++- net/core/scm.c | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index 54338fac45cb..4cc64d611cf4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -94,7 +94,10 @@ struct cmsghdr { #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) -#define CMSG_DATA(cmsg) ((void *)((char *)(cmsg) + sizeof(struct cmsghdr))) +#define CMSG_DATA(cmsg) \ + ((void *)(cmsg) + sizeof(struct cmsghdr)) +#define CMSG_USER_DATA(cmsg) \ + ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) diff --git a/net/core/scm.c b/net/core/scm.c index dc6fed1f221c..abfdc85a64c1 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -236,7 +236,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) err = -EFAULT; if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) goto out; cmlen = CMSG_SPACE(len); if (msg->msg_controllen < cmlen) @@ -300,7 +300,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) if (fdnum < fdmax) fdmax = fdnum; - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i Date: Mon, 11 May 2020 13:59:12 +0200 Subject: net/scm: cleanup scm_detach_fds Factor out two helpers to keep the code tidy. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/core/scm.c | 94 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index abfdc85a64c1..168b006a52ff 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -277,78 +277,86 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); +static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) +{ + struct socket *sock; + int new_fd; + int error; + + error = security_file_receive(file); + if (error) + return error; + + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } + + /* Bump the usage count and install the file. */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } + fd_install(new_fd, get_file(file)); + return error; +} + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = (__force struct cmsghdr __user*)msg->msg_control; - - int fdmax = 0; - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; + int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ?
O_CLOEXEC : 0; int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; - if (MSG_CMSG_COMPAT & msg->msg_flags) { + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - if (msg->msg_controllen > sizeof(struct cmsghdr)) - fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) - / sizeof(int)); - - if (fdnum < fdmax) - fdmax = fdnum; - - for (i=0, cmfptr =(int __user *)CMSG_USER_DATA(cm); ifp->fp[i], cmsg_data + i, o_flags); if (err) break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags - ? O_CLOEXEC : 0); - if (err < 0) - break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - sock = sock_from_file(fp[i], &err); - if (sock) { - sock_update_netprioidx(&sock->sk->sk_cgrp_data); - sock_update_classid(&sock->sk->sk_cgrp_data); - } - fd_install(new_fd, get_file(fp[i])); } - if (i > 0) - { - int cmlen = CMSG_LEN(i*sizeof(int)); + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { - cmlen = CMSG_SPACE(i*sizeof(int)); + cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } } - if (i < fdnum || (fdnum && fdmax <= 0)) + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } -- cgit v1.2.3 From 1f466e1f15cf1dac7c86798d694649fc42cd868a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:13 +0200 Subject: net: cleanly handle kernel vs user buffers for ->msg_control The msg_control field in struct msghdr can either contain a user pointer when used with the recvmsg system call, or a kernel pointer when used with sendmsg. To complicate things further kernel_recvmsg can stuff a kernel pointer in and then use set_fs to make the uaccess helpers accept it. Replace it with a union of a kernel pointer msg_control field, and a user pointer msg_control_user one, and allow kernel_recvmsg to operate on a proper kernel pointer using a bitfield to override the normal choice of a user pointer for recvmsg. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/socket.h | 12 +++++++++++- net/compat.c | 5 +++-- net/core/scm.c | 49 ++++++++++++++++++++++++++++--------------------- net/ipv4/ip_sockglue.c | 3 ++- net/socket.c | 22 ++++++++---------------- 5 files changed, 50 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index 4cc64d611cf4..04d2bc97f497 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -50,7 +50,17 @@ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iov_iter msg_iter; /* data */ - void *msg_control; /* ancillary data */ + + /* + * Ancillary data. msg_control_user is the user buffer used for the + * recv* side when msg_control_is_user is set, msg_control is the kernel + * buffer used for all other cases.
+ */ + union { + void *msg_control; + void __user *msg_control_user; + }; + bool msg_control_is_user : 1; __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ diff --git a/net/compat.c b/net/compat.c index 4bed96e84d9a..69fc6d1e4e6e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -56,7 +56,8 @@ int __get_compat_msghdr(struct msghdr *kmsg, if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); - kmsg->msg_control = compat_ptr(msg.msg_control); + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = compat_ptr(msg.msg_control); kmsg->msg_controllen = msg.msg_controllen; if (save_addr) @@ -121,7 +122,7 @@ int get_compat_msghdr(struct msghdr *kmsg, ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ (ucmlen) <= (unsigned long) \ ((mhdr)->msg_controllen - \ - ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user))) static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, struct compat_cmsghdr __user *cmsg, int cmsg_len) diff --git a/net/core/scm.c b/net/core/scm.c index 168b006a52ff..a75cd637a71f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. 
*/ } @@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + struct cmsghdr cmhdr; + + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || + copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) + return -EFAULT; + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; } EXPORT_SYMBOL(put_cmsg); @@ -328,6 +331,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) return; } + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + for (i = 0; i < fdmax; i++) { err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa3fd61818c4..8206047d70b6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1492,7 +1492,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = (__force void *) optval; + msg.msg_control_is_user = true; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; diff --git a/net/socket.c b/net/socket.c index 2dd739fba866..1c9a7260a41d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -924,14 +924,9 @@ EXPORT_SYMBOL(sock_recvmsg); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { - mm_segment_t oldfs = get_fs(); - int result; - + msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); - set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, flags); - set_fs(oldfs); - return result; + return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); @@ -2239,7 +2234,8 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = (void __force *)msg.msg_control; + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -2331,16 +2327,10 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, goto out; } err = -EFAULT; - /* - * Careful! Before this, msg_sys->msg_control contains a user pointer. - * Afterwards, it will be a kernel pointer. Thus the compiler-assisted - * checking falls down on this. 
- */ - if (copy_from_user(ctl_buf, - (void __user __force *)msg_sys->msg_control, - ctl_len)) + if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; + msg_sys->msg_control_is_user = false; } msg_sys->msg_flags = flags; -- cgit v1.2.3 From 54a0ed0df49609f4e3f098f8943e38e389dc2e15 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 May 2020 20:20:25 +0300 Subject: net: dsa: provide an option for drivers to always receive bridge VLANs DSA assumes that a bridge which has vlan filtering disabled is not vlan aware, and ignores all vlan configuration. However, the kernel software bridge code allows configuration in this state. This causes the kernel's idea of the bridge vlan state and the hardware state to disagree, so "bridge vlan show" indicates a correct configuration but the hardware lacks all configuration. Even worse, enabling vlan filtering on a DSA bridge immediately blocks all traffic which, given the output of "bridge vlan show", is very confusing. Provide an option that drivers can set to indicate they want to receive vlan configuration even when vlan filtering is disabled. At the very least, this is safe for Marvell DSA bridges, which do not look up ingress traffic in the VTU if the port is in 8021Q disabled state. It is also safe for the Ocelot switch family. Whether this change is suitable for all DSA bridges is not known. Signed-off-by: Russell King Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/dsa_priv.h | 1 + net/dsa/port.c | 14 ++++++++++++++ net/dsa/slave.c | 8 ++++---- 4 files changed, 26 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 312c2f067e65..50389772c597 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -282,6 +282,13 @@ struct dsa_switch { */ bool vlan_filtering_is_global; + /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges + * that have vlan_filtering=0. All drivers should ideally set this (and + * then the option would get removed), but it is unknown whether this + * would break things or not. + */ + bool configure_vlan_while_not_filtering; + /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. 
*/ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index a1a0ae242012..adecf73bd608 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -138,6 +138,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans); +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, diff --git a/net/dsa/port.c b/net/dsa/port.c index ebc8d6cbd1d4..e23ece229c7e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -257,6 +257,20 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, return 0; } +/* This enforces legacy behavior for switch drivers which assume they can't + * receive VLAN configuration when enslaved to a bridge with vlan_filtering=0 + */ +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (!dp->bridge_dev) + return false; + + return (!ds->configure_vlan_while_not_filtering && + !br_vlan_enabled(dp->bridge_dev)); +} + int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 61b0de52040a..886490fb203d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -314,7 +314,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, if (obj->orig_dev != dev) return -EOPNOTSUPP; - if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); @@ -381,7 +381,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, if (obj->orig_dev != dev) return -EOPNOTSUPP; - if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* Do not deprogram the CPU port as it may be shared with other user @@ -1240,7 +1240,7 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { - if (!br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* br_vlan_get_info() returns -EINVAL or -ENOENT if the @@ -1274,7 +1274,7 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { - if (!br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* br_vlan_get_info() returns -EINVAL or -ENOENT if the -- cgit v1.2.3 From 1f66b0f0aec671f8fbc86d75b2efdf7c7e0f7880 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:26 +0300 Subject: net: dsa: tag_8021q: introduce a vid_is_dsa_8021q helper This function returns a boolean denoting whether the VLAN passed as argument is part of the 1024-3071 range that the dsa_8021q tagging scheme uses. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 7 +++++++ net/dsa/tag_8021q.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index b8daaec0896e..ebc245ff838a 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -50,6 +50,8 @@ int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); +bool vid_is_dsa_8021q(u16 vid); + #else int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -107,6 +109,11 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } +bool vid_is_dsa_8021q(u16 vid) +{ + return false; +} + #endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index ff9c5bf64bda..4774ecd1f8fc 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -93,6 +93,13 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +bool vid_is_dsa_8021q(u16 vid) +{ + return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || + (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX); +} +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); + static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) { struct bridge_vlan_info vinfo; -- cgit v1.2.3 From ec5ae61076d07be986df19773662506220757c9f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:29 +0300 Subject: net: dsa: sja1105: save/restore VLANs using a delta commit method Managing the VLAN table that is present in hardware will become very difficult once we add a third operating state (best_effort_vlan_filtering). That is because correct cleanup (not too little, not too much) becomes virtually impossible, when VLANs can be added from the bridge layer, from dsa_8021q for basic tagging, for cross-chip bridging, as well as retagging rules for sub-VLANs and cross-chip sub-VLANs. So we need to rethink VLAN interaction with the switch in a more scalable way. In preparation for that, use the priv->expect_dsa_8021q boolean to classify any VLAN request received through .port_vlan_add or .port_vlan_del towards either one of 2 internal lists: bridge VLANs and dsa_8021q VLANs. Then, implement a central sja1105_build_vlan_table method that creates a VLAN configuration from scratch based on the 2 lists of VLANs kept by the driver, and based on the VLAN awareness state. Currently, if we are VLAN-unaware, install the dsa_8021q VLANs, otherwise the bridge VLANs. Then, implement a delta commit procedure that identifies which VLANs from this new configuration are actually different from the config previously committed to hardware. We apply the delta through the dynamic configuration interface (we don't reset the switch). The result is that the hardware should see the exact sequence of operations as before this patch. This also helps remove the "br" argument passed to dsa_8021q_crosschip_bridge_join, which it was only using to figure out whether it should commit the configuration back to us or not, based on the VLAN awareness state of the bridge. We can simplify that, by always allowing those VLANs inside of our dsa_8021q_vlans list, and committing those to hardware when necessary. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105.h | 10 + drivers/net/dsa/sja1105/sja1105_main.c | 493 ++++++++++++++++++++++++--------- include/linux/dsa/8021q.h | 19 +- net/dsa/tag_8021q.c | 45 ++- 4 files changed, 393 insertions(+), 174 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 667056d0c819..c80f1999c694 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -178,6 +178,14 @@ struct sja1105_flow_block { int num_virtual_links; }; +struct sja1105_bridge_vlan { + struct list_head list; + int port; + u16 vid; + bool pvid; + bool untagged; +}; + enum sja1105_vlan_state { SJA1105_VLAN_UNAWARE, SJA1105_VLAN_FILTERING_FULL, @@ -191,6 +199,8 @@ struct sja1105_private { struct gpio_desc *reset_gpio; struct spi_device *spidev; struct dsa_switch *ds; + struct list_head dsa_8021q_vlans; + struct list_head bridge_vlans; struct list_head crosschip_links; struct sja1105_flow_block flow_block; struct sja1105_port ports[SJA1105_NUM_PORTS]; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 8e68adba9144..fb95130299b1 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -303,7 +303,8 @@ static int sja1105_init_static_vlan(struct sja1105_private *priv) .tag_port = 0, .vlanid = 1, }; - int i; + struct dsa_switch *ds = priv->ds; + int port; table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; @@ -324,12 +325,31 @@ static int sja1105_init_static_vlan(struct sja1105_private *priv) table->entry_count = 1; /* VLAN 1: all DT-defined ports are members; no restrictions on - * forwarding; always transmit priority-tagged frames as untagged. + * forwarding; always transmit as untagged. */ - for (i = 0; i < SJA1105_NUM_PORTS; i++) { - pvid.vmemb_port |= BIT(i); - pvid.vlan_bc |= BIT(i); - pvid.tag_port &= ~BIT(i); + for (port = 0; port < ds->num_ports; port++) { + struct sja1105_bridge_vlan *v; + + if (dsa_is_unused_port(ds, port)) + continue; + + pvid.vmemb_port |= BIT(port); + pvid.vlan_bc |= BIT(port); + pvid.tag_port &= ~BIT(port); + + /* Let traffic that don't need dsa_8021q (e.g. STP, PTP) be + * transmitted as untagged. + */ + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) + return -ENOMEM; + + v->port = port; + v->vid = 1; + v->untagged = true; + if (dsa_is_cpu_port(ds, port)) + v->pvid = true; + list_add(&v->list, &priv->dsa_8021q_vlans); } ((struct sja1105_vlan_lookup_entry *)table->entries)[0] = pvid; @@ -1717,82 +1737,6 @@ static int sja1105_pvid_apply(struct sja1105_private *priv, int port, u16 pvid) &mac[port], true); } -static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid) -{ - struct sja1105_vlan_lookup_entry *vlan; - int count, i; - - vlan = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entries; - count = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entry_count; - - for (i = 0; i < count; i++) - if (vlan[i].vlanid == vid) - return i; - - /* Return an invalid entry index if not found */ - return -1; -} - -static int sja1105_vlan_apply(struct sja1105_private *priv, int port, u16 vid, - bool enabled, bool untagged) -{ - struct sja1105_vlan_lookup_entry *vlan; - struct sja1105_table *table; - bool keep = true; - int match, rc; - - table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; - - match = sja1105_is_vlan_configured(priv, vid); - if (match < 0) { - /* Can't delete a missing entry. 
*/ - if (!enabled) - return 0; - rc = sja1105_table_resize(table, table->entry_count + 1); - if (rc) - return rc; - match = table->entry_count - 1; - } - /* Assign pointer after the resize (it's new memory) */ - vlan = table->entries; - vlan[match].vlanid = vid; - if (enabled) { - vlan[match].vlan_bc |= BIT(port); - vlan[match].vmemb_port |= BIT(port); - } else { - vlan[match].vlan_bc &= ~BIT(port); - vlan[match].vmemb_port &= ~BIT(port); - } - /* Also unset tag_port if removing this VLAN was requested, - * just so we don't have a confusing bitmap (no practical purpose). - */ - if (untagged || !enabled) - vlan[match].tag_port &= ~BIT(port); - else - vlan[match].tag_port |= BIT(port); - /* If there's no port left as member of this VLAN, - * it's time for it to go. - */ - if (!vlan[match].vmemb_port) - keep = false; - - dev_dbg(priv->ds->dev, - "%s: port %d, vid %llu, broadcast domain 0x%llx, " - "port members 0x%llx, tagged ports 0x%llx, keep %d\n", - __func__, port, vlan[match].vlanid, vlan[match].vlan_bc, - vlan[match].vmemb_port, vlan[match].tag_port, keep); - - rc = sja1105_dynamic_config_write(priv, BLK_IDX_VLAN_LOOKUP, vid, - &vlan[match], keep); - if (rc < 0) - return rc; - - if (!keep) - return sja1105_table_delete_entry(table, match); - - return 0; -} - static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, int tree_index, int sw_index, int other_port, struct net_device *br) @@ -1813,7 +1757,7 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, other_priv->expect_dsa_8021q = true; rc = dsa_8021q_crosschip_bridge_join(ds, port, other_ds, - other_port, br, + other_port, &priv->crosschip_links); other_priv->expect_dsa_8021q = false; if (rc) @@ -1821,7 +1765,7 @@ static int sja1105_crosschip_bridge_join(struct dsa_switch *ds, priv->expect_dsa_8021q = true; rc = dsa_8021q_crosschip_bridge_join(other_ds, other_port, ds, - port, br, + port, &other_priv->crosschip_links); priv->expect_dsa_8021q = false; if (rc) @@ -1852,35 +1796,16 @@ static void sja1105_crosschip_bridge_leave(struct dsa_switch *ds, other_priv->expect_dsa_8021q = true; dsa_8021q_crosschip_bridge_leave(ds, port, other_ds, other_port, - br, &priv->crosschip_links); + &priv->crosschip_links); other_priv->expect_dsa_8021q = false; priv->expect_dsa_8021q = true; - dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, - port, br, + dsa_8021q_crosschip_bridge_leave(other_ds, other_port, ds, port, &other_priv->crosschip_links); priv->expect_dsa_8021q = false; } } -static int sja1105_replay_crosschip_vlans(struct dsa_switch *ds, bool enabled) -{ - struct sja1105_private *priv = ds->priv; - struct dsa_8021q_crosschip_link *c; - int rc; - - list_for_each_entry(c, &priv->crosschip_links, list) { - priv->expect_dsa_8021q = true; - rc = dsa_8021q_crosschip_link_apply(ds, c->port, c->other_ds, - c->other_port, enabled); - priv->expect_dsa_8021q = false; - if (rc) - break; - } - - return rc; -} - static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) { struct sja1105_private *priv = ds->priv; @@ -1896,11 +1821,6 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) return rc; } } - rc = sja1105_replay_crosschip_vlans(ds, enabled); - if (rc) { - dev_err(ds->dev, "Failed to replay crosschip VLANs: %d\n", rc); - return rc; - } dev_info(ds->dev, "%s switch tagging\n", enabled ? 
"Enabled" : "Disabled"); @@ -1914,6 +1834,269 @@ sja1105_get_tag_protocol(struct dsa_switch *ds, int port, return DSA_TAG_PROTO_SJA1105; } +static int sja1105_is_vlan_configured(struct sja1105_private *priv, u16 vid) +{ + struct sja1105_vlan_lookup_entry *vlan; + int count, i; + + vlan = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entries; + count = priv->static_config.tables[BLK_IDX_VLAN_LOOKUP].entry_count; + + for (i = 0; i < count; i++) + if (vlan[i].vlanid == vid) + return i; + + /* Return an invalid entry index if not found */ + return -1; +} + +static int sja1105_commit_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_vlan_lookup_entry *vlan; + struct sja1105_table *table; + int num_vlans = 0; + int rc, i, k = 0; + + /* VLAN table */ + table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; + vlan = table->entries; + + for (i = 0; i < VLAN_N_VID; i++) { + int match = sja1105_is_vlan_configured(priv, i); + + if (new_vlan[i].vlanid != VLAN_N_VID) + num_vlans++; + + if (new_vlan[i].vlanid == VLAN_N_VID && match >= 0) { + /* Was there before, no longer is. Delete */ + dev_dbg(priv->ds->dev, "Deleting VLAN %d\n", i); + rc = sja1105_dynamic_config_write(priv, + BLK_IDX_VLAN_LOOKUP, + i, &vlan[match], false); + if (rc < 0) + return rc; + } else if (new_vlan[i].vlanid != VLAN_N_VID) { + /* Nothing changed, don't do anything */ + if (match >= 0 && + vlan[match].vlanid == new_vlan[i].vlanid && + vlan[match].tag_port == new_vlan[i].tag_port && + vlan[match].vlan_bc == new_vlan[i].vlan_bc && + vlan[match].vmemb_port == new_vlan[i].vmemb_port) + continue; + /* Update entry */ + dev_dbg(priv->ds->dev, "Updating VLAN %d\n", i); + rc = sja1105_dynamic_config_write(priv, + BLK_IDX_VLAN_LOOKUP, + i, &new_vlan[i], + true); + if (rc < 0) + return rc; + } + } + + if (table->entry_count) + kfree(table->entries); + + table->entries = kcalloc(num_vlans, table->ops->unpacked_entry_size, + GFP_KERNEL); + if (!table->entries) + return -ENOMEM; + + table->entry_count = num_vlans; + vlan = table->entries; + + for (i = 0; i < VLAN_N_VID; i++) { + if (new_vlan[i].vlanid == VLAN_N_VID) + continue; + vlan[k++] = new_vlan[i]; + } + + return 0; +} + +struct sja1105_crosschip_switch { + struct list_head list; + struct dsa_switch *other_ds; +}; + +static int sja1105_commit_pvid(struct sja1105_private *priv) +{ + struct sja1105_bridge_vlan *v; + struct list_head *vlan_list; + int rc = 0; + + if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) + vlan_list = &priv->bridge_vlans; + else + vlan_list = &priv->dsa_8021q_vlans; + + list_for_each_entry(v, vlan_list, list) { + if (v->pvid) { + rc = sja1105_pvid_apply(priv, v->port, v->vid); + if (rc) + break; + } + } + + return rc; +} + +static int +sja1105_build_bridge_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_bridge_vlan *v; + + if (priv->vlan_state == SJA1105_VLAN_UNAWARE) + return 0; + + list_for_each_entry(v, &priv->bridge_vlans, list) { + int match = v->vid; + + new_vlan[match].vlanid = v->vid; + new_vlan[match].vmemb_port |= BIT(v->port); + new_vlan[match].vlan_bc |= BIT(v->port); + if (!v->untagged) + new_vlan[match].tag_port |= BIT(v->port); + } + + return 0; +} + +static int +sja1105_build_dsa_8021q_vlans(struct sja1105_private *priv, + struct sja1105_vlan_lookup_entry *new_vlan) +{ + struct sja1105_bridge_vlan *v; + + if (priv->vlan_state == SJA1105_VLAN_FILTERING_FULL) + return 0; + + list_for_each_entry(v, &priv->dsa_8021q_vlans, list) { + int 
match = v->vid; + + new_vlan[match].vlanid = v->vid; + new_vlan[match].vmemb_port |= BIT(v->port); + new_vlan[match].vlan_bc |= BIT(v->port); + if (!v->untagged) + new_vlan[match].tag_port |= BIT(v->port); + } + + return 0; +} + +static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify); + +static int sja1105_notify_crosschip_switches(struct sja1105_private *priv) +{ + struct sja1105_crosschip_switch *s, *pos; + struct list_head crosschip_switches; + struct dsa_8021q_crosschip_link *c; + int rc = 0; + + INIT_LIST_HEAD(&crosschip_switches); + + list_for_each_entry(c, &priv->crosschip_links, list) { + bool already_added = false; + + list_for_each_entry(s, &crosschip_switches, list) { + if (s->other_ds == c->other_ds) { + already_added = true; + break; + } + } + + if (already_added) + continue; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + dev_err(priv->ds->dev, "Failed to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + s->other_ds = c->other_ds; + list_add(&s->list, &crosschip_switches); + } + + list_for_each_entry(s, &crosschip_switches, list) { + struct sja1105_private *other_priv = s->other_ds->priv; + + rc = sja1105_build_vlan_table(other_priv, false); + if (rc) + goto out; + } + +out: + list_for_each_entry_safe(s, pos, &crosschip_switches, list) { + list_del(&s->list); + kfree(s); + } + + return rc; +} + +static int sja1105_build_vlan_table(struct sja1105_private *priv, bool notify) +{ + struct sja1105_vlan_lookup_entry *new_vlan; + struct sja1105_table *table; + int rc; + int i; + + table = &priv->static_config.tables[BLK_IDX_VLAN_LOOKUP]; + new_vlan = kcalloc(VLAN_N_VID, + table->ops->unpacked_entry_size, GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + + for (i = 0; i < VLAN_N_VID; i++) + new_vlan[i].vlanid = VLAN_N_VID; + + /* Bridge VLANs */ + rc = sja1105_build_bridge_vlans(priv, new_vlan); + if (rc) + goto out; + + /* VLANs necessary for dsa_8021q operation, given to us by tag_8021q.c: + * - RX VLANs + * - TX VLANs + * - Crosschip links + */ + rc = sja1105_build_dsa_8021q_vlans(priv, new_vlan); + if (rc) + goto out; + + rc = sja1105_commit_vlans(priv, new_vlan); + if (rc) + goto out; + + rc = sja1105_commit_pvid(priv); + if (rc) + goto out; + + if (notify) { + rc = sja1105_notify_crosschip_switches(priv); + if (rc) + goto out; + } + +out: + kfree(new_vlan); + + return rc; +} + +/* Select the list to which we should add this VLAN. 
*/ +static struct list_head *sja1105_classify_vlan(struct sja1105_private *priv, + u16 vid) +{ + if (priv->expect_dsa_8021q) + return &priv->dsa_8021q_vlans; + + return &priv->bridge_vlans; +} + static int sja1105_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { @@ -2026,45 +2209,80 @@ static void sja1105_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; + bool vlan_table_changed = false; u16 vid; int rc; for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_apply(priv, port, vid, true, vlan->flags & - BRIDGE_VLAN_INFO_UNTAGGED); - if (rc < 0) { - dev_err(ds->dev, "Failed to add VLAN %d to port %d: %d\n", - vid, port, rc); - return; - } - if (vlan->flags & BRIDGE_VLAN_INFO_PVID) { - rc = sja1105_pvid_apply(ds->priv, port, vid); - if (rc < 0) { - dev_err(ds->dev, "Failed to set pvid %d on port %d: %d\n", - vid, port, rc); - return; + bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct sja1105_bridge_vlan *v; + struct list_head *vlan_list; + bool already_added = false; + + vlan_list = sja1105_classify_vlan(priv, vid); + + list_for_each_entry(v, vlan_list, list) { + if (v->port == port && v->vid == vid && + v->untagged == untagged && v->pvid == pvid) { + already_added = true; + break; } } + + if (already_added) + continue; + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) { + dev_err(ds->dev, "Out of memory while storing VLAN\n"); + return; + } + + v->port = port; + v->vid = vid; + v->untagged = untagged; + v->pvid = pvid; + list_add(&v->list, vlan_list); + + vlan_table_changed = true; } + + if (!vlan_table_changed) + return; + + rc = sja1105_build_vlan_table(priv, true); + if (rc) + dev_err(ds->dev, "Failed to build VLAN table: %d\n", rc); } static int sja1105_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct sja1105_private *priv = ds->priv; + bool vlan_table_changed = false; u16 vid; - int rc; for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { - rc = sja1105_vlan_apply(priv, port, vid, false, vlan->flags & - BRIDGE_VLAN_INFO_UNTAGGED); - if (rc < 0) { - dev_err(ds->dev, "Failed to remove VLAN %d from port %d: %d\n", - vid, port, rc); - return rc; + struct sja1105_bridge_vlan *v, *n; + struct list_head *vlan_list; + + vlan_list = sja1105_classify_vlan(priv, vid); + + list_for_each_entry_safe(v, n, vlan_list, list) { + if (v->port == port && v->vid == vid) { + list_del(&v->list); + kfree(v); + vlan_table_changed = true; + break; + } } } - return 0; + + if (!vlan_table_changed) + return 0; + + return sja1105_build_vlan_table(priv, true); } /* The programming model for the SJA1105 switch is "all-at-once" via static @@ -2142,6 +2360,7 @@ static int sja1105_setup(struct dsa_switch *ds) static void sja1105_teardown(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + struct sja1105_bridge_vlan *v, *n; int port; for (port = 0; port < SJA1105_NUM_PORTS; port++) { @@ -2158,6 +2377,16 @@ static void sja1105_teardown(struct dsa_switch *ds) sja1105_tas_teardown(ds); sja1105_ptp_clock_unregister(ds); sja1105_static_config_free(&priv->static_config); + + list_for_each_entry_safe(v, n, &priv->dsa_8021q_vlans, list) { + list_del(&v->list); + kfree(v); + } + + list_for_each_entry_safe(v, n, &priv->bridge_vlans, list) { + list_del(&v->list); + kfree(v); + } } static int sja1105_port_enable(struct dsa_switch *ds, int port, @@ -2598,6 
+2827,8 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->mgmt_lock); INIT_LIST_HEAD(&priv->crosschip_links); + INIT_LIST_HEAD(&priv->bridge_vlans); + INIT_LIST_HEAD(&priv->dsa_8021q_vlans); sja1105_tas_setup(ds); sja1105_flower_setup(ds); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index ebc245ff838a..404bd2cce642 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -25,18 +25,14 @@ struct dsa_8021q_crosschip_link { int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, bool enabled); -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled); - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links); int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, @@ -60,16 +56,9 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, return 0; } -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled) -{ - return 0; -} - int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { return 0; @@ -77,7 +66,7 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { return 0; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4774ecd1f8fc..3236fbbf85b9 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -296,9 +296,9 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, bool enabled) +static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) { u16 rx_vid = dsa_8021q_rx_vid(ds, port); @@ -308,7 +308,6 @@ int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, BRIDGE_VLAN_INFO_UNTAGGED, enabled); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_link_apply); static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, @@ -369,7 +368,7 @@ static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, */ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { /* @other_upstream is how @other_ds reaches us. 
If we are part @@ -385,12 +384,10 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, - other_port, true); - if (rc) - return rc; - } + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, other_upstream, @@ -398,20 +395,14 @@ int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, - other_upstream, true); - if (rc) - return rc; - } - - return 0; + return dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); } EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, - int other_port, struct net_device *br, + int other_port, struct list_head *crosschip_links) { int other_upstream = dsa_upstream_port(other_ds, other_port); @@ -431,14 +422,12 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, if (keep) continue; - if (!br_vlan_enabled(br)) { - rc = dsa_8021q_crosschip_link_apply(ds, port, - other_ds, - other_port, - false); - if (rc) - return rc; - } + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; } } -- cgit v1.2.3 From 38b5beeae7a4cde87edabb0196fac1f55ae668ee Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:32 +0300 Subject: net: dsa: sja1105: prepare tagger for handling DSA tags and VLAN simultaneously In VLAN-unaware mode, sja1105 uses VLAN tags with a custom TPID of 0xdadb. While in the yet-to-be introduced best_effort_vlan_filtering mode, it needs to work with normal VLAN TPID values. A complication arises when we must transmit a VLAN-tagged packet to the switch when it's in VLAN-aware mode. We need to construct a packet with 2 VLAN tags, and the switch will use the outer header for routing and pop it on egress. But sadly, here the 2 hardware generations don't behave the same: - E/T switches won't pop an ETH_P_8021AD tag on egress, it seems (packets will remain double-tagged). - P/Q/R/S switches will drop a packet with 2 ETH_P_8021Q tags (it looks like it tries to prevent VLAN hopping). But looks like the reverse is also true: - E/T switches have no problem popping the outer tag from packets with 2 ETH_P_8021Q tags. - P/Q/R/S will have no problem popping a single tag even if that is ETH_P_8021AD. So it is clear that if we want the hardware to work with dsa_8021q tagging in VLAN-aware mode, we need to send different TPIDs depending on revision. Keep that information in priv->info->qinq_tpid. The per-port tagger structure will hold an xmit_tpid value that depends not only upon the qinq_tpid, but also upon the VLAN awareness state itself (in case we must transmit using 0xdadb). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105.h | 6 ++++++ drivers/net/dsa/sja1105/sja1105_main.c | 10 ++++++++++ drivers/net/dsa/sja1105/sja1105_spi.c | 6 ++++++ include/linux/dsa/sja1105.h | 1 + net/dsa/tag_sja1105.c | 32 +++++++++++++++++++++----------- 5 files changed, 44 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index c80f1999c694..a019ffae38f1 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -87,6 +87,12 @@ struct sja1105_info { const struct sja1105_dynamic_table_ops *dyn_ops; const struct sja1105_table_ops *static_ops; const struct sja1105_regs *regs; + /* Both E/T and P/Q/R/S have quirks when it comes to popping the S-Tag + * from double-tagged frames. E/T will pop it only when it's equal to + * TPID from the General Parameters Table, while P/Q/R/S will only + * pop it when it's equal to TPID2. + */ + u16 qinq_tpid; int (*reset_cmd)(struct dsa_switch *ds); int (*setup_rgmii_delay)(const void *ctx, int port); /* Prototypes from include/net/dsa.h */ diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 7b9c3db98e1d..b7e4a85caade 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2153,6 +2153,15 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) tpid2 = ETH_P_SJA1105; } + for (port = 0; port < ds->num_ports; port++) { + struct sja1105_port *sp = &priv->ports[port]; + + if (enabled) + sp->xmit_tpid = priv->info->qinq_tpid; + else + sp->xmit_tpid = ETH_P_SJA1105; + } + if (!enabled) state = SJA1105_VLAN_UNAWARE; else @@ -2866,6 +2875,7 @@ static int sja1105_probe(struct spi_device *spi) goto out; } skb_queue_head_init(&sp->xmit_queue); + sp->xmit_tpid = ETH_P_SJA1105; } return 0; diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index 0be75c49e6c3..a0dacae803cc 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -512,6 +512,7 @@ struct sja1105_info sja1105e_info = { .part_no = SJA1105ET_PART_NO, .static_ops = sja1105e_table_ops, .dyn_ops = sja1105et_dyn_ops, + .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, .reset_cmd = sja1105et_reset_cmd, @@ -526,6 +527,7 @@ struct sja1105_info sja1105t_info = { .part_no = SJA1105ET_PART_NO, .static_ops = sja1105t_table_ops, .dyn_ops = sja1105et_dyn_ops, + .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, .reset_cmd = sja1105et_reset_cmd, @@ -540,6 +542,7 @@ struct sja1105_info sja1105p_info = { .part_no = SJA1105P_PART_NO, .static_ops = sja1105p_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -555,6 +558,7 @@ struct sja1105_info sja1105q_info = { .part_no = SJA1105Q_PART_NO, .static_ops = sja1105q_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -570,6 +574,7 @@ struct sja1105_info sja1105r_info = { .part_no = SJA1105R_PART_NO, .static_ops = sja1105r_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, @@ -586,6 +591,7 @@ struct sja1105_info sja1105s_info = { .static_ops = sja1105s_table_ops, .dyn_ops = sja1105pqrs_dyn_ops, .regs = 
&sja1105pqrs_regs, + .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index fa5735c353cd..f821d08b1b5f 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -59,6 +59,7 @@ struct sja1105_port { struct sja1105_tagger_data *data; struct dsa_port *dp; bool hwts_tx_en; + u16 xmit_tpid; }; #endif /* _NET_DSA_SJA1105_H */ diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5ecac5921a7d..398e2b9a1b96 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -69,12 +69,25 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } +static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) +{ + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + + if (hdr->h_vlan_proto == ntohs(ETH_P_SJA1105)) + return true; + + if (hdr->h_vlan_proto != ntohs(ETH_P_8021Q)) + return false; + + return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); +} + /* This is the first time the tagger sees the frame on RX. * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_can_use_vlan_as_tags(skb)) return true; if (sja1105_is_link_local(skb)) return true; @@ -96,6 +109,11 @@ static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, return NULL; } +static u16 sja1105_xmit_tpid(struct sja1105_port *sp) +{ + return sp->xmit_tpid; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -111,15 +129,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, if (unlikely(sja1105_is_link_local(skb))) return sja1105_defer_xmit(dp->priv, skb); - /* If we are under a vlan_filtering bridge, IP termination on - * switch ports based on 802.1Q tags is simply too brittle to - * be passable. So just defer to the dsa_slave_notag_xmit - * implementation. - */ - if (dsa_port_is_vlan_filtering(dp)) - return skb; - - return dsa_8021q_xmit(skb, netdev, ETH_P_SJA1105, + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } @@ -258,7 +268,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, hdr = eth_hdr(skb); tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105); + is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); -- cgit v1.2.3 From 3eaae1d05f2b5be1be834bfad64f8fc2ad39a56d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:33 +0300 Subject: net: dsa: tag_8021q: support up to 8 VLANs per port using sub-VLANs For switches that support VLAN retagging, such as sja1105, we extend dsa_8021q by encoding a "sub-VLAN" into the remaining 3 free bits in the dsa_8021q tag. A sub-VLAN is nothing more than a number in the range 0-7, which serves as an index into a per-port driver lookup table. The sub-VLAN value of zero means that traffic is untagged (this is also backwards-compatible with dsa_8021q without retagging). The switch should be configured to retag VLAN-tagged traffic that gets transmitted towards the CPU port (and towards the CPU only). Example: bridge vlan add dev sw1p0 vid 100 The switch retags frames received on port 0, going to the CPU, and having VID 100, to the VID of 1104 (0x0450). 
In dsa_8021q language: | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | +-----------+-----+-----------------+-----------+-----------------------+ | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | +-----------+-----+-----------------+-----------+-----------------------+ 0x0450 means: - DIR = 0b01: this is an RX VLAN - SUBVLAN = 0b001: this is subvlan #1 - SWITCH_ID = 0b001: this is switch 1 (see the name "sw1p0") - PORT = 0b0000: this is port 0 (see the name "sw1p0") The driver also remembers the "1 -> 100" mapping. In the hotpath, if the sub-VLAN from the tag encodes a non-untagged frame, this mapping is used to create a VLAN hwaccel tag, with the value of 100. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 16 ++++++++++++++ net/dsa/tag_8021q.c | 56 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 404bd2cce642..311aa04e7520 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -20,6 +20,8 @@ struct dsa_8021q_crosschip_link { refcount_t refcount; }; +#define DSA_8021Q_N_SUBVLAN 8 + #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, @@ -42,10 +44,14 @@ u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan); + int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); +u16 dsa_8021q_rx_subvlan(u16 vid); + bool vid_is_dsa_8021q(u16 vid); #else @@ -88,6 +94,11 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) return 0; } +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return 0; +} + int dsa_8021q_rx_switch_id(u16 vid) { return 0; @@ -98,6 +109,11 @@ int dsa_8021q_rx_source_port(u16 vid) return 0; } +u16 dsa_8021q_rx_subvlan(u16 vid) +{ + return 0; +} + bool vid_is_dsa_8021q(u16 vid) { return false; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3236fbbf85b9..3052da668156 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -17,7 +17,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | RSV | SWITCH_ID | RSV | PORT | + * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -27,17 +27,24 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * RSV - VID[9]: - * To be used for further expansion of SWITCH_ID or for other purposes. - * Must be transmitted as zero and ignored on receive. + * SVL/SUBVLAN - { VID[9], VID[5:4] }: + * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. + * * 0 (0b000): Field does not encode a sub-VLAN, either because + * received traffic is untagged, PVID-tagged or because a second + * VLAN tag is present after this tag and not inside of it. + * * 1 (0b001): Received traffic is tagged with a VID value private + * to the host. This field encodes the index in the host's lookup + * table through which the value of the ingress VLAN ID can be + * recovered. + * * 2 (0b010): Field encodes a sub-VLAN. + * ... + * * 7 (0b111): Field encodes a sub-VLAN. 
+ * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero + * (by the host) and ignored on receive (by the switch). * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * - * RSV - VID[5:4]: - * To be used for further expansion of PORT or for other purposes. - * Must be transmitted as zero and ignored on receive. - * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. */ @@ -54,6 +61,18 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) +#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 +#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 +#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(4, 3) +#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_SUBVLAN(x) \ + (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ + DSA_8021Q_SUBVLAN_LO_MASK) | \ + ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ + DSA_8021Q_SUBVLAN_HI_MASK)) + #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ @@ -79,6 +98,13 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); + /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { @@ -93,6 +119,20 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +/* Returns the decoded subvlan from the RX VID. */ +u16 dsa_8021q_rx_subvlan(u16 vid) +{ + u16 svl_hi, svl_lo; + + svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> + DSA_8021Q_SUBVLAN_HI_SHIFT; + svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> + DSA_8021Q_SUBVLAN_LO_SHIFT; + + return (svl_hi << 2) | svl_lo; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); + bool vid_is_dsa_8021q(u16 vid) { return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || -- cgit v1.2.3 From 84eeb5d460e399795e9a92a0cd44999254886150 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 May 2020 20:20:34 +0300 Subject: net: dsa: tag_sja1105: implement sub-VLAN decoding Create a subvlan_map as part of each port's tagger private structure. This keeps reverse mappings of bridge-to-dsa_8021q VLAN retagging rules. Note that as of this patch, this piece of code is never engaged, due to the fact that the driver hasn't installed any retagging rule, so we'll always see packets with a subvlan code of 0 (untagged). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++++ include/linux/dsa/sja1105.h | 2 ++ net/dsa/tag_sja1105.c | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b7e4a85caade..fd15a18596ea 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2856,6 +2856,7 @@ static int sja1105_probe(struct spi_device *spi) struct sja1105_port *sp = &priv->ports[port]; struct dsa_port *dp = dsa_to_port(ds, port); struct net_device *slave; + int subvlan; if (!dsa_is_user_port(ds, port)) continue; @@ -2876,6 +2877,9 @@ static int sja1105_probe(struct spi_device *spi) } skb_queue_head_init(&sp->xmit_queue); sp->xmit_tpid = ETH_P_SJA1105; + + for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) + sp->subvlan_map[subvlan] = VLAN_N_VID; } return 0; diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index f821d08b1b5f..dd93735ae228 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -9,6 +9,7 @@ #include #include +#include #include #define ETH_P_SJA1105 ETH_P_DSA_8021Q @@ -53,6 +54,7 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) struct sja1105_port { + u16 subvlan_map[DSA_8021Q_N_SUBVLAN]; struct kthread_worker *xmit_worker; struct kthread_work xmit_work; struct sk_buff_head xmit_queue; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 398e2b9a1b96..ad105550b145 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -254,6 +254,20 @@ static struct sk_buff return skb; } +static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; + u16 vid = sp->subvlan_map[subvlan]; + u16 vlan_tci; + + if (vid == VLAN_N_VID) + return; + + vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) @@ -263,6 +277,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; + u16 subvlan = 0; bool is_tagged; bool is_meta; @@ -286,6 +301,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + subvlan = dsa_8021q_rx_subvlan(vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -310,6 +326,9 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } + if (subvlan) + sja1105_decode_subvlan(skb, subvlan); + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -- cgit v1.2.3 From 51fa960d3b5163b1af22efdebcabfccc5d615ad6 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 12 May 2020 10:36:23 -0700 Subject: erspan: Check IFLA_GRE_ERSPAN_VER is set. Add a check to make sure the IFLA_GRE_ERSPAN_VER is provided by users. Fixes: f989d546a2d5 ("erspan: Add type I version 0 support.") Cc: Eric Dumazet Signed-off-by: William Tu Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e29cd48674d7..0ce9b91ff55c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1087,7 +1087,8 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], if (ret) return ret; - if (nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + if (data[IFLA_GRE_ERSPAN_VER] && + nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) return 0; /* ERSPAN type II/III should only have GRE sequence and key flag */ -- cgit v1.2.3 From fb9f2e92864f51d25e790947cca2ac4426a12f9c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 13 May 2020 03:23:27 +0300 Subject: net: dsa: tag_sja1105: appease sparse checks for ethertype accessors A comparison between a value from the packet and an integer constant value needs to be done by converting the value from the packet from net->host, or the constant from host->net. Not the other way around. Even though it makes no practical difference, correct that. Fixes: 38b5beeae7a4 ("net: dsa: sja1105: prepare tagger for handling DSA tags and VLAN simultaneously") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index ad105550b145..9b4a4d719291 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -73,10 +73,10 @@ static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) { struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); - if (hdr->h_vlan_proto == ntohs(ETH_P_SJA1105)) + if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) return true; - if (hdr->h_vlan_proto != ntohs(ETH_P_8021Q)) + if (hdr->h_vlan_proto != htons(ETH_P_8021Q)) return false; return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); -- cgit v1.2.3 From aaebf8e6088270e45d30314031b5d9a88a589cb9 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:09:32 -0700 Subject: Bluetooth: Fix incorrect type for window and interval The types for window and interval should be uint16, not uint8. Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 3f470f0e432c..f6870e98faab 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -890,7 +890,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req) struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; - u8 window, interval; + u16 window, interval; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); -- cgit v1.2.3 From 91779665c129d361c032d2a590b37a077b6cb9d7 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:09:33 -0700 Subject: Bluetooth: Modify LE window and interval for suspend When a device is suspended, it doesn't need to be as responsive to connection events. Increase the interval to 640ms (creating a duty cycle of roughly 1.75%) so that passive scanning uses much less power (vs previous duty cycle of 18.75%). The new window + interval combination has been tested to work with HID devices (which are currently the only devices capable of wake up). 
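To sanity-check the duty-cycle figures above: HCI LE scan window and interval values are expressed in units of 0.625 ms, so the numbers in this message can be reproduced with a small stand-alone sketch (plain userspace C, not part of the patch):

#include <stdio.h>

/* Stand-alone sketch (not kernel code): reproduce the duty cycles
 * quoted in the commit message. HCI LE scan parameters are in units
 * of 0.625 ms.
 */
int main(void)
{
	const double unit_ms = 0.625;
	const unsigned int window = 0x0012;		/* 18 units */
	const unsigned int old_interval = 0x0060;	/* 96 units */
	const unsigned int new_interval = 0x0400;	/* 1024 units */

	printf("window   = %6.2f ms\n", window * unit_ms);	/* 11.25 */
	printf("interval = %6.2f ms (old), %6.2f ms (new)\n",
	       old_interval * unit_ms,
	       new_interval * unit_ms);				/* 60, 640 */
	printf("duty     = %5.2f%% (old), %5.2f%% (new)\n",
	       100.0 * window / old_interval,			/* 18.75 */
	       100.0 * window / new_interval);			/* ~1.76 */
	return 0;
}

That is, 18/96 gives the previous 18.75% duty cycle and 18/1024 gives the new figure of roughly 1.75%.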
Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index f6870e98faab..6b45e31432a7 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -35,7 +35,7 @@ #define HCI_REQ_CANCELED 2 #define LE_SUSPEND_SCAN_WINDOW 0x0012 -#define LE_SUSPEND_SCAN_INTERVAL 0x0060 +#define LE_SUSPEND_SCAN_INTERVAL 0x0400 void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { -- cgit v1.2.3 From 0d2c9825e46d45f8a520135c9c791b5c73a165ab Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:19:25 -0700 Subject: Bluetooth: Rename BT_SUSPEND_COMPLETE Renamed BT_SUSPEND_COMPLETE to BT_SUSPEND_CONFIGURE_WAKE since it sets up the event filter and whitelist for wake-up. Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_core.c | 2 +- net/bluetooth/hci_request.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0c7f3ad76665..869ee2b30a4c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -110,7 +110,7 @@ enum suspend_tasks { enum suspended_state { BT_RUNNING = 0, BT_SUSPEND_DISCONNECT, - BT_SUSPEND_COMPLETE, + BT_SUSPEND_CONFIGURE_WAKE, }; struct hci_conn_hash { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 51d399273276..de1f4e72ec06 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3353,7 +3353,7 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, /* Only configure whitelist if disconnect succeeded */ if (!ret) ret = hci_change_suspend_state(hdev, - BT_SUSPEND_COMPLETE); + BT_SUSPEND_CONFIGURE_WAKE); } else if (action == PM_POST_SUSPEND) { ret = hci_change_suspend_state(hdev, BT_RUNNING); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 6b45e31432a7..1fc55685da62 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1090,7 +1090,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) disconnect_counter); set_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks); } - } else if (next == BT_SUSPEND_COMPLETE) { + } else if (next == BT_SUSPEND_CONFIGURE_WAKE) { /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; /* Enable event filter for paired devices */ -- cgit v1.2.3 From 81dafad53c67abe4f09b0b04131fe490e76e5602 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Tue, 12 May 2020 19:19:26 -0700 Subject: Bluetooth: Add hook for driver to prevent wake from suspend Let drivers have a hook to disable configuring scanning during suspend. Drivers should use the device_may_wakeup function call to determine whether hci should be configured for wakeup. 
For example, an implementation for btusb may look like the following: bool btusb_prevent_wake(struct hci_dev *hdev) { struct btusb_data *data = hci_get_drvdata(hdev); return !device_may_wakeup(&data->udev->dev); } Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 869ee2b30a4c..5dcf85f186c6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -503,6 +503,7 @@ struct hci_dev { int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); void (*cmd_timeout)(struct hci_dev *hdev); + bool (*prevent_wake)(struct hci_dev *hdev); }; #define HCI_PHY_HANDLE(handle) (handle & 0xff) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index de1f4e72ec06..dbe2d79f233f 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3350,8 +3350,10 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, */ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); - /* Only configure whitelist if disconnect succeeded */ - if (!ret) + /* Only configure whitelist if disconnect succeeded and wake + * isn't being prevented. + */ + if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) ret = hci_change_suspend_state(hdev, BT_SUSPEND_CONFIGURE_WAKE); } else if (action == PM_POST_SUSPEND) { -- cgit v1.2.3 From adf1d6926444029396861413aba8a0f2a805742a Mon Sep 17 00:00:00 2001 From: Sonny Sasaka Date: Wed, 6 May 2020 12:55:03 -0700 Subject: Bluetooth: Handle Inquiry Cancel error after Inquiry Complete After sending Inquiry Cancel command to the controller, it is possible that Inquiry Complete event comes before Inquiry Cancel command complete event. In this case the Inquiry Cancel command will have status of Command Disallowed since there is no Inquiry session to be cancelled. This case should not be treated as error, otherwise we can reach an inconsistent state. Example of a btmon trace when this happened: < HCI Command: Inquiry Cancel (0x01|0x0002) plen 0 > HCI Event: Inquiry Complete (0x01) plen 1 Status: Success (0x00) > HCI Event: Command Complete (0x0e) plen 4 Inquiry Cancel (0x01|0x0002) ncmd 1 Status: Command Disallowed (0x0c) Signed-off-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 006c24e04b44..73aabca0064b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -42,12 +42,27 @@ /* Handle HCI Event packets */ -static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb, + u8 *new_status) { __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); + /* It is possible that we receive Inquiry Complete event right + * before we receive Inquiry Cancel Command Complete event, in + * which case the latter event should have status of Command + * Disallowed (0x0c). This should not be treated as error, since + * we actually achieve what Inquiry Cancel wants to achieve, + * which is to end the last Inquiry session. 
+ */ + if (status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); + status = 0x00; + } + + *new_status = status; + if (status) return; @@ -3233,7 +3248,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, switch (*opcode) { case HCI_OP_INQUIRY_CANCEL: - hci_cc_inquiry_cancel(hdev, skb); + hci_cc_inquiry_cancel(hdev, skb, status); break; case HCI_OP_PERIODIC_INQ: -- cgit v1.2.3 From 5b440676c15bbe1a40f2546ec92db83ed66d9e22 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 14 Apr 2020 16:08:40 +0800 Subject: Bluetooth: L2CAP: add support for waiting disconnection resp Whenever we disconnect a L2CAP connection, we would immediately report a disconnection event (EPOLLHUP) to the upper layer, without waiting for the response of the other device. This patch offers an option to wait until we receive a disconnection response before reporting disconnection event, by using the "how" parameter in l2cap_sock_shutdown(). Therefore, upper layer can opt to wait for disconnection response by shutdown(sock, SHUT_WR). This can be used to enforce proper disconnection order in HID, where the disconnection of the interrupt channel must be complete before attempting to disconnect the control channel. Signed-off-by: Archie Pusaka Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1cea42ee1e92..a995d2c51fa7 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1271,14 +1271,21 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) struct l2cap_conn *conn; int err = 0; - BT_DBG("sock %p, sk %p", sock, sk); + BT_DBG("sock %p, sk %p, how %d", sock, sk, how); + + /* 'how' parameter is mapped to sk_shutdown as follows: + * SHUT_RD (0) --> RCV_SHUTDOWN (1) + * SHUT_WR (1) --> SEND_SHUTDOWN (2) + * SHUT_RDWR (2) --> SHUTDOWN_MASK (3) + */ + how++; if (!sk) return 0; lock_sock(sk); - if (sk->sk_shutdown) + if ((sk->sk_shutdown & how) == how) goto shutdown_already; BT_DBG("Handling sock shutdown"); @@ -1301,11 +1308,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) * has already been actioned to close the L2CAP * link such as by l2cap_disconnection_req(). 
*/ - if (sk->sk_shutdown) - goto has_shutdown; + if ((sk->sk_shutdown & how) == how) + goto shutdown_matched; } - sk->sk_shutdown = SHUTDOWN_MASK; + /* Try setting the RCV_SHUTDOWN bit, return early if SEND_SHUTDOWN + * is already set + */ + if ((how & RCV_SHUTDOWN) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + sk->sk_shutdown |= RCV_SHUTDOWN; + if ((sk->sk_shutdown & how) == how) + goto shutdown_matched; + } + + sk->sk_shutdown |= SEND_SHUTDOWN; release_sock(sk); l2cap_chan_lock(chan); @@ -1335,7 +1351,7 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); -has_shutdown: +shutdown_matched: l2cap_chan_put(chan); sock_put(sk); @@ -1363,7 +1379,7 @@ static int l2cap_sock_release(struct socket *sock) bt_sock_unlink(&l2cap_sk_list, sk); - err = l2cap_sock_shutdown(sock, 2); + err = l2cap_sock_shutdown(sock, SHUT_RDWR); chan = l2cap_pi(sk)->chan; l2cap_chan_hold(chan); -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- net/ipv6/route.c | 18 +++++++++--------- net/netlink/af_netlink.c | 18 +++++++++--------- 6 files changed, 61 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ab94dfd8826f..6fa773e2d1bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1153,7 +1153,7 @@ struct bpf_iter_meta { u64 seq_num; }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will 
be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). + */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = 
sizeof(struct bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = &task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = bpf_iter_reg_target(&task_reg_info); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25f6d3e619d0..6ad2fa51a23a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6397,17 +6397,17 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) +static const struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "ipv6_route", - .seq_ops = &ipv6_route_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct ipv6_route_iter), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 33cda9baa979..839827227e98 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2803,17 +2803,17 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static const struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), +}; + static int __init bpf_iter_register(void) { - struct bpf_iter_reg reg_info = { - .target = "netlink", - .seq_ops = &netlink_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct nl_seq_iter), - }; - - return bpf_iter_reg_target(®_info); + return bpf_iter_reg_target(&netlink_reg_info); } #endif -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). 
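The resulting usage pattern is symmetric: a caller keeps one static reg_info and hands the same pointer to both calls. A minimal sketch with a hypothetical "foo" target (foo_seq_ops and struct foo_iter_priv are placeholders, not real kernel symbols; assumes the usual <linux/bpf.h> and <linux/init.h> headers):

/* Hypothetical target "foo": register and unregister with the same
 * static reg_info instead of looking the target up by name.
 */
static const struct bpf_iter_reg foo_reg_info = {
	.target		= "foo",
	.seq_ops	= &foo_seq_ops,			/* placeholder */
	.seq_priv_size	= sizeof(struct foo_iter_priv),	/* placeholder */
};

static int __init foo_iter_init(void)
{
	return bpf_iter_reg_target(&foo_reg_info);
}

static void __exit foo_iter_exit(void)
{
	/* After this patch: unregister by pointer, not by name. */
	bpf_iter_unreg_target(&foo_reg_info);
}

Unregistering by pointer also lets bpf_iter_unreg_target() drop the strcmp()-based lookup in favor of a plain pointer comparison, as the diff below shows.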
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- include/linux/bpf.h | 2 +- kernel/bpf/bpf_iter.c | 4 ++-- net/ipv6/route.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6fa773e2d1bf..534174eca86b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1154,7 +1154,7 @@ struct bpf_iter_meta { }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); -void bpf_iter_unreg_target(const char *target); +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6ad2fa51a23a..22bf4e36c093 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6412,7 +6412,7 @@ static int __init bpf_iter_register(void) static void bpf_iter_unregister(void) { - bpf_iter_unreg_target("ipv6_route"); + bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. 
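A sketch of the registration this enables, in the spirit of the ipv6_route target (struct bpf_iter__ipv6_route is defined by the ip6_fib.h hunk below; treat the field values as illustrative rather than a verbatim quote of the hunk):

/* Illustrative sketch: a target declaring a per-argument reg_type.
 * Marking the "rt" ctx argument PTR_TO_BTF_ID_OR_NULL makes the
 * verifier force BPF programs to NULL-check it before dereferencing.
 */
static const struct bpf_iter_reg ipv6_route_reg_info = {
	.target			= "ipv6_route",
	.seq_ops		= &ipv6_route_seq_ops,
	.init_seq_private	= bpf_iter_init_seq_net,
	.fini_seq_private	= bpf_iter_fini_seq_net,
	.seq_priv_size		= sizeof(struct ipv6_route_iter),
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
};

Arguments without a registered entry keep the default tracing reg_type (PTR_TO_BTF_ID), which is what replaces the old all-or-nothing btf_id_or_null_non0_off flag.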
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++++++- include/net/ip6_fib.h | 7 +++++++ kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - net/ipv6/ip6_fib.c | 5 ----- net/ipv6/route.c | 5 +++++ net/netlink/af_netlink.c | 5 +++++ 10 files changed, 60 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 534174eca86b..c45d198ac38c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -643,6 +643,12 @@ struct bpf_jit_poke_descriptor { u16 reason; }; +/* reg_type info for ctx arguments */ +struct bpf_ctx_arg_aux { + u32 offset; + enum bpf_reg_type reg_type; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -654,12 +660,13 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + u32 ctx_arg_info_size; + const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; - bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1139,12 +1146,15 @@ int bpf_obj_get_user(const char __user *pathname, int flags); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +#define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 ctx_arg_info_size; + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; }; struct bpf_iter_meta { diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 80262d2980f5..870b646c5797 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -540,6 +540,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; +#endif + #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int 
ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a1fcc0ca21af..250ff52c674e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2638,11 +2638,6 @@ static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) -struct bpf_iter__ipv6_route { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct fib6_info *, rt); -}; - static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 22bf4e36c093..22e56465f14d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6403,6 +6403,11 @@ static const struct bpf_iter_reg ipv6_route_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), + .ctx_arg_info_size = 1, 
+ .ctx_arg_info = { + { offsetof(struct bpf_iter__ipv6_route, rt), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 839827227e98..4f2c3b14ddbf 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2809,6 +2809,11 @@ static const struct bpf_iter_reg netlink_reg_info = { .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__netlink, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_iter_register(void) -- cgit v1.2.3 From 6e8a4f9dda3823274fa8a4c1aa5e6a93f9775749 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 13:07:59 +0200 Subject: net: ignore sock_from_file errors in __scm_install_fd The code had historically been ignoring these errors, and my recent refactoring changed that, which broke ssh in some setups. Fixes: 2618d530dd8b ("net/scm: cleanup scm_detach_fds") Reported-by: Ido Schimmel Signed-off-by: Christoph Hellwig Tested-by: Ido Schimmel Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index a75cd637a71f..875df1c2989d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -307,7 +307,7 @@ static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(file)); - return error; + return 0; } static int scm_max_fds(struct msghdr *msg) -- cgit v1.2.3 From 1b2f08df0a886e0565c71821d5230cba395f5c18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 21:36:41 +0200 Subject: ipv6: set msg_control_is_user in do_ipv6_getsockopt While do_ipv6_getsockopt does not call the high-level recvmsg helper, the msghdr eventually ends up being passed to put_cmsg anyway, and thus needs msg_control_is_user set to the proper value. Fixes: 1f466e1f15cf ("net: cleanly handle kernel vs user buffers for ->msg_control") Reported-by: Eric Dumazet Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 18d05403d3b5..a0e50cc57e54 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1075,6 +1075,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; msg.msg_flags = flags; + msg.msg_control_is_user = true; lock_sock(sk); skb = np->pktoptions; -- cgit v1.2.3 From 5a46b062e28f57bffde767437fad3ab1d0cee2c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 May 2020 10:28:22 -0700 Subject: devlink: refactor end checks in devlink_nl_cmd_region_read_dumpit Clean up after recent fixes, move address calculations around and change the variable init, so that we can have just one start_offset == end_offset check. Make the check a little stricter to preserve the -EINVAL error if requested start offset is larger than the region itself. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/core/devlink.c | 41 +++++++++------------- .../selftests/drivers/net/netdevsim/devlink.sh | 15 ++++++++ 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 20f935fa29f5..7b76e5fffc10 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4215,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct nlattr **attrs, u64 start_offset, u64 end_offset, - bool dump, u64 *new_offset) { struct devlink_snapshot *snapshot; @@ -4230,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > region->size || dump) - end_offset = region->size; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -4260,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - u64 ret_offset, start_offset, end_offset = 0; + u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - bool dump = true; void *hdr; int err; @@ -4294,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + /* return 0 if there is no further data to read */ - if (start_offset >= region->size) { + if (start_offset == end_offset) { err = 0; goto out_unlock; } @@ -4322,27 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - dump = false; - - if (start_offset == end_offset) { - err = 0; - goto nla_put_failure; - } - } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, region, attrs, start_offset, - end_offset, dump, - &ret_offset); + end_offset, &ret_offset); if (err && err != -EMSGSIZE) goto nla_put_failure; diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index ad539eccddcb..de4b32fc4223 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -146,6 +146,21 @@ regions_test() check_region_snapshot_count dummy post-first-request 3 + devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null + check_err $? "Failed to dump snapshot with id 25" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (1 byte)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (128 bytes)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null + check_err $? 
"Failed to read snapshot with id 25 (oversized)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1 + check_fail $? "Bad read of snapshot with id 25 did not fail" + devlink region del $DL_HANDLE/dummy snapshot 25 check_err $? "Failed to delete snapshot with id 25" -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte load and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); Since otherwise clang may optimize the load to be 2-byte and it's rejected by verifier. Add support for 1- and 2-byte loads same way as it's supported for other fields in bpf_sock_addr like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 15 +++++++-------- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ diff --git a/net/core/filter.c b/net/core/filter.c index da0634979f53..1fe8c0c2d408 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7029,6 +7029,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7059,10 +7060,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7958,8 +7955,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7994,9 +7991,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. 
*/ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From d56c2f95adb3d401bf982b6cf8fc4bb6d2f7acdd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:45 -0700 Subject: bpf: Allow sk lookup helpers in cgroup skb Currently sk lookup helpers are allowed in tc, xdp, sk skb, and cgroup sock_addr programs. But they would be useful in cgroup skb as well, so that, for example, a cgroup skb ingress program can look up the peer socket a packet comes from on the same host and decide whether to allow or deny that packet based on the properties of the socket, e.g. the cgroup that the peer socket belongs to. Allow the following sk lookup helpers in cgroup skb: * bpf_sk_lookup_tcp; * bpf_sk_lookup_udp; * bpf_sk_release; * bpf_skc_lookup_tcp. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f8c7ee280f1582b586629436d777b6db00597d63.1589486450.git.rdna@fb.com --- net/core/filter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 1fe8c0c2d408..9c3eada5c86c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6159,6 +6159,14 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; #endif #ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: -- cgit v1.2.3 From 06d3e4c9f11afc849dc201ecf9ef7a43eeb1dddd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:46 -0700 Subject: bpf: Allow skb_ancestor_cgroup_id helper in cgroup skb cgroup skb programs can already use bpf_skb_cgroup_id. Allow bpf_skb_ancestor_cgroup_id as well, so that container policies can be implemented for a container that can have sub-cgroups dynamically created, while the policies are still based on the cgroup id of the container itself, not on the id of a sub-cgroup. 
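A sketch of the intended use (hypothetical program; CONTAINER_CGROUP_ID and the ancestor level stand in for values an orchestrator would provide):

	SEC("cgroup_skb/ingress")
	int ingress(struct __sk_buff *skb)
	{
		/* Compare against the container's cgroup two levels below
		 * the root, regardless of which dynamically created
		 * sub-cgroup the socket actually sits in.
		 */
		if (bpf_skb_ancestor_cgroup_id(skb, 2) == CONTAINER_CGROUP_ID)
			return 1;	/* allow */
		return 0;		/* deny */
	}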
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/8874194d6041eba190356453ea9f6071edf5f658.1589486450.git.rdna@fb.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 9c3eada5c86c..a47dc5b9dad4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6157,6 +6157,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: -- cgit v1.2.3 From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:47 -0700 Subject: bpf: Introduce bpf_sk_{,ancestor_}cgroup_id helpers With the ability to look up sockets in cgroup skb programs, it becomes useful to access the cgroup id of retrieved sockets so that policies can be implemented based on the origin cgroup of such a socket. For example, a container running in a cgroup can have a cgroup skb ingress program that looks up the peer socket that is sending packets to a process inside the container and decides whether those packets should be allowed or denied based on the cgroup id of the peer. More specifically, such an ingress program can implement the intra-host policy "allow incoming packets only from this same container and not from any other container on the same host" w/o relying on source IP addresses, since quite often containers share the same IP address on the host. Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and bpf_sk_ancestor_cgroup_id(). These helpers are similar to the existing bpf_skb_{,ancestor_}cgroup_id helpers with the only difference that sk is used to get the cgroup id instead of skb, and they share code with them. See documentation in UAPI for more details. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com --- include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- net/core/filter.c | 60 +++++++++++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- 3 files changed, 121 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. 
The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a47dc5b9dad4..5815902bb617 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void 
*dst_buff, const void *src_buff, @@ -6159,6 +6197,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From a075767bbdc659066b89be282c8377fa880e9dc4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:28 +0200 Subject: net: XDP-generic determining XDP frame size The SKB "head" pointer points to the data area that contains skb_shared_info, which can be found via skb_end_pointer(). Given that xdp->data_hard_start has been established (basically pointing to skb->head), the frame size is the span from data_hard_start to skb_end_pointer(), plus the size reserved for skb_shared_info. Change the bpf_xdp_adjust_tail offset adjustment of skb->len to be a positive number on grow and a negative number on shrink, as this seems more natural when reading the code. 
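For comparison, a page-per-packet driver does the same bookkeeping with a constant; a sketch (hypothetical driver RX path, names assumed):

	struct xdp_buff xdp;

	/* The whole page backs the frame, so frame_sz is simply
	 * PAGE_SIZE; headroom and the skb_shared_info tailroom both
	 * live inside that span.
	 */
	xdp.data_hard_start = page_address(page);
	xdp.data = xdp.data_hard_start + XDP_PACKET_HEADROOM;
	xdp.data_meta = xdp.data;
	xdp.data_end = xdp.data + pkt_len;
	xdp.frame_sz = PAGE_SIZE;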
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945336804.97035.7164852191163722056.stgit@firesoul --- net/core/dev.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4c91de39890a..f937a3ff668d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ -- cgit v1.2.3 From 34cc0b338a61de3eee3a2bfcaf4f9d6e9fae091a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:33 +0200 Subject: xdp: Xdp_frame add member frame_sz and handle in convert_to_xdp_frame Use a hole in struct xdp_frame when adding member frame_sz, which keeps the sizeof the struct the same (32 bytes). Drivers ixgbe and sfc had bug cases where the necessary/expected tailroom was not reserved. This can lead to some hard-to-catch memory corruption issues. With the driver's frame_sz available, this can be detected when the packet end via xdp->data_end exceeds the xdp_data_hard_end pointer, which accounts for the reserved tailroom. When this driver issue is detected, simply fail the conversion with NULL, which results in feedback to the driver (a failing xdp_do_redirect()) causing the driver to drop the packet. Given the lack of consistent XDP stats, this can be hard to troubleshoot. And given this is a driver bug, we want to generate some more noise in the form of a WARN stack dump (to identify the driver code that inlined convert_to_xdp_frame). Inlining the WARN macro is problematic, because it adds an asm instruction (ud2 on Intel CPUs) which influences instruction cache prefetching. Thus, introduce xdp_warn and the XDP_WARN macro to avoid this and at the same time make identifying the function and line of this inlined function easier. 
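For reference, a sketch of the relation the new tailroom check relies on (the real helper lives in include/net/xdp.h):

	/* Sketch: the usable packet area ends where the tailroom
	 * reserved for skb_shared_info begins.
	 */
	#define xdp_data_hard_end(xdp)				\
		((xdp)->data_hard_start + (xdp)->frame_sz -	\
		 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))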
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337313.97035.10015729316710496600.stgit@firesoul --- include/net/xdp.h | 14 +++++++++++++- net/core/xdp.c | 8 ++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index a764af4ae0ea..3094fccf5a88 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -89,7 +89,8 @@ struct xdp_frame { void *data; u16 len; u16 headroom; - u16 metasize; + u32 metasize:8; + u32 frame_sz:24; /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ @@ -104,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +/* Avoids inlining WARN macro in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line); +#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) + struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); /* Convert xdp_buff to xdp_frame */ @@ -124,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return NULL; + /* Catch if driver didn't reserve tailroom for skb_shared_info */ + if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { + XDP_WARN("Driver BUG: missing reserved tailroom"); + return NULL; + } + /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; @@ -131,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; + xdp_frame->frame_sz = xdp->frame_sz; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..490b8f5fa8ee 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); -- cgit v1.2.3 From c8741e2bfe872425ea6f10bb6f7dc1d67bc60c3a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:25 +0200 Subject: xdp: Allow bpf_xdp_adjust_tail() to grow packet size Finally, after all drivers have a frame size, allow the BPF helper bpf_xdp_adjust_tail() to grow or extend the packet size at the frame tail. Remember that the helper/macro xdp_data_hard_end has reserved some tailroom. Thus, this helper makes sure that the BPF program doesn't get access to this tailroom area. 
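A minimal sketch of a program the helper now accepts; growing past the reserved tailroom still fails with -EINVAL:

	SEC("xdp")
	int xdp_grow_tail(struct xdp_md *ctx)
	{
		/* Positive delta grows the tail; negative still shrinks. */
		if (bpf_xdp_adjust_tail(ctx, 4) < 0)
			return XDP_ABORTED;
		return XDP_PASS;
	}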
V2: Remove one chicken check and use WARN_ONCE for the other. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945348530.97035.12577148209134239291.stgit@firesoul --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..e7b033dad44e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,12 +3411,19 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; -- cgit v1.2.3 From ddb47d518ca10948d1f64a983cb9274720f691cd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:30 +0200 Subject: xdp: Clear grow memory in bpf_xdp_adjust_tail() Clear the tail memory when a grow happens, because it is too easy to write an XDP_PASS program that extends the tail, which exposes this memory to users that can run tcpdump. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945349039.97035.5262100484553494.stgit@firesoul --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e7b033dad44e..a85eb538d4d6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3427,6 +3427,10 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; -- cgit v1.2.3 From bc56c919fce782f616823b76fb70a788f4762cf5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:35 +0200 Subject: bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp(). Update the memory requirements when adding xdp.frame_sz in the BPF test_run function bpf_prog_test_run_xdp(), which e.g. is used by XDP selftests. 
Specifically add the expected reserved tailroom, but also allocate a larger memory area to reflect that XDP frames usually come in this format. Limit the provided packet data size to 4096 minus headroom + tailroom, as this also reflects a common 3520-byte MTU limit with XDP. Note that bpf_test_init already uses a memory allocation method that clears memory. Thus, this already guards against leaking uninit kernel memory. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945349549.97035.15316291762482444006.stgit@firesoul --- net/bpf/test_run.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 29dbdd4c29f6..30ba7d38941d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -470,25 +470,34 @@ out: int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + u32 max_data_sz; void *data; int ret; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + if (size > max_data_sz) + return -EINVAL; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) return PTR_ERR(data); xdp.data_hard_start = data; - xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; @@ -496,8 +505,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || - xdp.data_end != xdp.data + size) + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags, and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three are currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enables ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 
'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that a networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigations applied. Such a networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 18 ++++++++- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ net/core/bpf_sk_storage.c | 4 +- net/core/filter.c | 4 +- 19 files changed, 134 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = 
bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. 
*/ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case 
BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? 
-EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - 
!register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. 
*/ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. 
*/ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/filter.c b/net/core/filter.c index a85eb538d4d6..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6687,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6699,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: -- cgit v1.2.3 From 2e186a2cf8c785f38ef3237e83f8921f82f6e2b7 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Fri, 15 May 2020 11:52:52 +0200 Subject: net: core: recursively find netdev by device node The assumption that a device node is associated either with the netdev's device, or the parent of that device, does not hold for all drivers. E.g. Freescale's DPAA has two layers of platform devices above the netdev. Instead, recursively walk up the tree from the netdev, allowing any parent to match against the sought after node. Signed-off-by: Tobias Waldekranz Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 880e89c894f6..e353b822bb15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1805,12 +1805,12 @@ static struct class net_class __ro_after_init = { #ifdef CONFIG_OF_NET static int of_dev_node_match(struct device *dev, const void *data) { - int ret = 0; - - if (dev->parent) - ret = dev->parent->of_node == data; + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } - return ret == 0 ? dev->of_node == data : ret; + return 0; } /* -- cgit v1.2.3 From f8ab1807a9c9aa14478920e64d1c9d3685aae26f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:11 +0300 Subject: net: sched: introduce terse dump flag Add new TCA_DUMP_FLAGS attribute and use it in cls API to request terse filter output from classifiers with TCA_DUMP_FLAGS_TERSE flag. This option is intended to be used to improve performance of TC filter dump when userland only needs to obtain stats and not the whole classifier/action data. Extend struct tcf_proto_ops with new terse_dump() callback that must be defined by supporting classifier implementations. Support of the options in specific classifiers and actions is implemented in following patches in the series. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 4 ++++ include/uapi/linux/rtnetlink.h | 6 ++++++ net/sched/cls_api.c | 39 +++++++++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ab87a8b86a32..c510b03b9751 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -330,6 +330,10 @@ struct tcf_proto_ops { int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); + int (*terse_dump)(struct net *net, + struct tcf_proto *tp, void *fh, + struct sk_buff *skb, + struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 4a8c5b745157..073e71ef6bdd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -609,11 +609,17 @@ enum { TCA_HW_OFFLOAD, TCA_INGRESS_BLOCK, TCA_EGRESS_BLOCK, + TCA_DUMP_FLAGS, __TCA_MAX }; #define TCA_MAX (__TCA_MAX - 1) +#define TCA_DUMP_FLAGS_TERSE (1 << 0) /* Means that in dump user gets only basic + * data necessary to identify the objects + * (handle, cookie, etc.) and stats. + */ + #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 299b963c796e..cb2c10e0fee5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1851,7 +1851,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool rtnl_held) + bool terse_dump, bool rtnl_held) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1878,6 +1878,14 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; + } else if (terse_dump) { + if (tp->ops->terse_dump) { + if (tp->ops->terse_dump(net, tp, fh, skb, tcm, + rtnl_held) < 0) + goto nla_put_failure; + } else { + goto cls_op_not_supp; + } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) @@ -1888,6 +1896,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, out_nlmsg_trim: nla_put_failure: +cls_op_not_supp: nlmsg_trim(skb, b); return -1; } @@ -1908,7 +1917,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1940,7 +1949,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -2501,6 +2510,7 @@ struct tcf_dump_args { struct tcf_block *block; struct Qdisc *q; u32 parent; + bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -2511,12 +2521,12 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true); + RTM_NEWTFILTER, a->terse_dump, true); } 
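/* Illustration only, not part of this patch: a userspace dump request
 * asking for terse output. TCA_DUMP_FLAGS is validated as a
 * NLA_BITFIELD32 (see the tcf_tfilter_dump_policy added below), so the
 * payload is a value/selector pair. The libmnl helpers and the device
 * name here are assumptions for the sketch, not code from this series.
 */
char buf[MNL_SOCKET_BUFFER_SIZE];
struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
struct tcmsg *tcm;
struct nla_bitfield32 terse = {
	.value    = TCA_DUMP_FLAGS_TERSE,
	.selector = TCA_DUMP_FLAGS_TERSE,
};

nlh->nlmsg_type  = RTM_GETTFILTER;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
tcm->tcm_family  = AF_UNSPEC;
tcm->tcm_ifindex = if_nametoindex("eth0");	/* assumed device */
mnl_attr_put(nlh, TCA_DUMP_FLAGS, sizeof(terse), &terse);
/* Each filter in the resulting dump then carries only identification
 * data and stats instead of the full classifier options.
 */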
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, - long index_start, long *p_index) + long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; @@ -2545,7 +2555,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true) <= 0) + RTM_NEWTFILTER, false, true) <= 0) goto errout; cb->args[1] = 1; } @@ -2561,6 +2571,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; + arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; @@ -2574,6 +2585,10 @@ errout: return false; } +static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { + [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), +}; + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2583,6 +2598,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + bool terse_dump = false; long index_start; long index; u32 parent; @@ -2592,10 +2608,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, - NULL, cb->extack); + tcf_tfilter_dump_policy, cb->extack); if (err) return err; + if (tca[TCA_DUMP_FLAGS]) { + struct nla_bitfield32 flags = + nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); + + terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; + } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) @@ -2653,7 +2676,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, - index_start, &index)) { + index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; -- cgit v1.2.3 From ca44b738e59420ae73d9e04a1be630a405e3a0f1 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:12 +0300 Subject: net: sched: implement terse dump support in act Extend tcf_action_dump() with boolean argument 'terse' that is used to request terse-mode action dump. In terse mode only essential data needed to identify particular action (action kind, cookie, etc.) and its stats is put to resulting skb and everything else is omitted. Implement tcf_exts_terse_dump() helper in cls API that is intended to be used to request terse dump of all exts (actions) attached to the filter. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 2 +- include/net/pkt_cls.h | 1 + net/sched/act_api.c | 30 +++++++++++++++++++++++------- net/sched/cls_api.c | 28 +++++++++++++++++++++++++++- 4 files changed, 52 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index c24d7643548e..1b4bfc4437be 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -193,7 +193,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, - int ref); + int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 04aa0649f3b0..ed65619cbc47 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -325,6 +325,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); +int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fbbec2e562f5..8ac7eb0a8309 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -766,12 +766,10 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int +tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a) { - int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) @@ -789,6 +787,23 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + if (tcf_action_dump_terse(skb, a)) + goto nla_put_failure; + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) @@ -820,7 +835,7 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], - int bind, int ref) + int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; @@ -831,7 +846,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, bind, ref); + err = terse ? 
tcf_action_dump_terse(skb, a) : + tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); @@ -1133,7 +1149,7 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], if (!nest) goto out_nlmsg_trim; - if (tcf_action_dump(skb, actions, bind, ref) < 0) + if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cb2c10e0fee5..752d608f4442 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3179,7 +3179,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0, false) + < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { @@ -3203,6 +3204,31 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_exts_dump); +int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + struct nlattr *nest; + + if (!exts->action || !tcf_exts_has_actions(exts)) + return 0; + + nest = nla_nest_start_noflag(skb, exts->action); + if (!nest) + goto nla_put_failure; + + if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +#else + return 0; +#endif +} +EXPORT_SYMBOL(tcf_exts_terse_dump); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { -- cgit v1.2.3 From 0348451db9fffebd01d7b496e108dd729c2fcb24 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:13 +0300 Subject: net: sched: cls_flower: implement terse dump support Implement tcf_proto_ops->terse_dump() callback for flower classifier. Only dump handle, flags and action data in terse mode. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 74a0febcafb8..0c574700da75 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2768,6 +2768,48 @@ nla_put_failure: return -1; } +static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + bool skip_hw; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + spin_lock(&tp->lock); + + skip_hw = tc_skip_hw(f->flags); + + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) + goto nla_put_failure_locked; + + spin_unlock(&tp->lock); + + if (!skip_hw) + fl_hw_update_stats(tp, f, rtnl_held); + + if (tcf_exts_terse_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure_locked: + spin_unlock(&tp->lock); +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; @@ -2832,6 +2874,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .hw_add = fl_hw_add, .hw_del = fl_hw_del, .dump = fl_dump, + .terse_dump = fl_terse_dump, .bind_class = fl_bind_class, .tmplt_create = fl_tmplt_create, .tmplt_destroy = fl_tmplt_destroy, -- cgit v1.2.3 From 90bf45134d55d626ae2713cac50cda10c6c8b0c2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:15 +0200 Subject: mptcp: add new sock flag to deal with join subflows MP_JOIN subflows must not land in the accept queue. Currently tcp_check_req() calls an mptcp specific helper to detect such a scenario. Such helper leverages the subflow context to check for MP_JOIN subflows. We also need to deal with MP JOIN failures, even when the subflow context is not available due to allocation failure. A possible solution would be changing the syn_recv_sock() signature to allow returning a more descriptive action/error code and deal with that in tcp_check_req(). Since the above need is MPTCP specific, this patch instead uses a TCP request socket hole to add an MPTCP-specific flag. Such flag is used by the MPTCP syn_recv_sock() to tell tcp_check_req() how to deal with the request socket. This change is a no-op for !MPTCP builds, and makes the MPTCP code simpler. It also allows the next patch to deal correctly with MP JOIN failure. v1 -> v2: - be more conservative on drop_req initialization (Mat) RFC -> v1: - move the drop_req bit inside tcp_request_sock (Eric) Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Christoph Paasch Signed-off-by: David S.
Miller --- include/linux/tcp.h | 3 +++ include/net/mptcp.h | 17 ++++++++++------- net/ipv4/tcp_minisocks.c | 2 +- net/mptcp/protocol.c | 7 ------- net/mptcp/subflow.c | 3 +++ 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e60db06ec28d..bf44e85d709d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -120,6 +120,9 @@ struct tcp_request_sock { u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; +#if IS_ENABLED(CONFIG_MPTCP) + bool drop_req; +#endif u32 txhash; u32 rcv_isn; u32 snt_isn; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index e60275659de6..c4a6ef4ba35b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,6 +68,11 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } +static inline bool rsk_drop_req(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; +} + void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); @@ -121,8 +126,6 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, skb_ext_find(from, SKB_EXT_MPTCP)); } -bool mptcp_sk_is_subflow(const struct sock *sk); - void mptcp_seq_show(struct seq_file *seq); #else @@ -140,6 +143,11 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return false; } +static inline bool rsk_drop_req(const struct request_sock *req) +{ + return false; +} + static inline void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) @@ -190,11 +198,6 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, return true; } -static inline bool mptcp_sk_is_subflow(const struct sock *sk) -{ - return false; -} - static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7e40322cc5ec..495dda2449fe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,7 +774,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; - if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(sk, req); return child; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e1f23016ed3f..a61e60e94137 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1638,13 +1638,6 @@ bool mptcp_finish_join(struct sock *sk) return ret; } -bool mptcp_sk_is_subflow(const struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - - return subflow->mp_join == 1; -} - static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 009d5c478062..5e03ed8ae899 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -470,6 +470,8 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + tcp_rsk(req)->drop_req = false; + /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. 
In the latter scenario we additionally need * to reset the context to non MPTCP status. @@ -512,6 +514,7 @@ create_child: goto close_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); + tcp_rsk(req)->drop_req = true; } } -- cgit v1.2.3 From 2f8a397d0a54b59c05e481523ab2a88a63d82d18 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:16 +0200 Subject: inet_connection_sock: factor out destroy helper. Move the steps to prepare an inet_connection_sock for forced disposal inside a separate helper. No functional changes intended, this will just simplify the next patch. Signed-off-by: Paolo Abeni Reviewed-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 8 ++++++++ net/ipv4/inet_connection_sock.c | 6 +----- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index a3f076befa4f..2f1f8c3efb26 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -287,6 +287,14 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); +static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + percpu_counter_inc(sk->sk_prot->orphan_count); + inet_sk(sk)->inet_num = 0; +} + void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5f34eb951627..d6faf3702824 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -896,11 +896,7 @@ void inet_csk_prepare_forced_close(struct sock *sk) /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); - - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - percpu_counter_inc(sk->sk_prot->orphan_count); - inet_sk(sk)->inet_num = 0; + inet_csk_prepare_for_destroy_sock(sk); } EXPORT_SYMBOL(inet_csk_prepare_forced_close); -- cgit v1.2.3 From 729cd6436f359b6e618c2f14836d419f40444503 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 May 2020 19:22:17 +0200 Subject: mptcp: cope better with MP_JOIN failure Currently, on MP_JOIN failure we reset the child socket, but leave the request socket untouched. tcp_check_req will deal with it according to the 'tcp_abort_on_overflow' sysctl value - by default the req socket will stay alive. The above leads to inconsistent behavior on MP JOIN failure, and bad listener overflow accounting. This patch addresses the issue by leveraging the infrastructure just introduced to ask the TCP stack to drop the req on failure. The child socket is no longer freed by subflow_syn_recv_sock(); instead it's moved to a dead state and will be disposed of by the next sock_put done by the TCP stack, so that listener overflow accounting is not affected by MP JOIN failure. Signed-off-by: Paolo Abeni Reviewed-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: David S.
Miller --- net/mptcp/subflow.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5e03ed8ae899..3cf2eeea9d80 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -478,7 +478,7 @@ create_child: */ if (!ctx || fallback) { if (fallback_is_fatal) - goto close_child; + goto dispose_child; if (ctx) { subflow_ulp_fallback(child, ctx); @@ -507,11 +507,11 @@ create_child: owner = mptcp_token_get_sock(ctx->token); if (!owner) - goto close_child; + goto dispose_child; ctx->conn = (struct sock *)owner; if (!mptcp_finish_join(child)) - goto close_child; + goto dispose_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; @@ -531,11 +531,14 @@ out: !mptcp_subflow_ctx(child)->conn)); return child; -close_child: +dispose_child: + tcp_rsk(req)->drop_req = true; tcp_send_active_reset(child, GFP_ATOMIC); - inet_csk_prepare_forced_close(child); + inet_csk_prepare_for_destroy_sock(child); tcp_done(child); - return NULL; + + /* The last child reference will be released by the caller */ + return child; } static struct inet_connection_sock_af_ops subflow_specific; -- cgit v1.2.3 From 9efd6a3cecdde984d67e63d17fe6af53c7c50968 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 May 2020 15:58:43 +0200 Subject: netns: enable to inherit devconf from current netns The goal is to be able to inherit the initial devconf parameters from the current netns, i.e. the netns where this new netns has been created. This is useful in a container environment where /proc/sys is read-only. For example, if a pod is created with specific devconf parameters and has the capability to create netns, the user expects to get the same parameters as its 'init_net', which is not the real init_net in this case. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 4 +++- net/core/sysctl_net_core.c | 4 +++- net/ipv4/devinet.c | 23 ++++++++++++++++++----- net/ipv6/addrconf.c | 23 ++++++++++++++++++++--- 4 files changed, 44 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 2ad1b77a7182..42cd04bca548 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -339,7 +339,9 @@ settings from init_net and for IPv6 we reset all settings to default. If set to 1, both IPv4 and IPv6 settings are forced to inherit from current ones in init_net. If set to 2, both IPv4 and IPv6 settings are -forced to reset to their default values. +forced to reset to their default values. If set to 3, both IPv4 and IPv6 settings are forced to inherit from current ones in the netns where this new netns has been created.
Default : 0 (for compatibility reasons) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0ddb13a6282b..b109cc8a6dd8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include static int two __maybe_unused = 2; +static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -39,6 +40,7 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); @@ -553,7 +555,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "high_order_alloc_disable", diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fc94f82f82c7..f048d0a188b7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2666,11 +2666,24 @@ static __net_init int devinet_init_net(struct net *net) tbl[0].extra2 = net; #endif - if ((!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) && - !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); - memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + if (!net_eq(net, &init_net)) { + if (IS_ENABLED(CONFIG_SYSCTL) && + sysctl_devconf_inherit_init_net == 3) { + /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } else if (!IS_ENABLED(CONFIG_SYSCTL) || + sysctl_devconf_inherit_init_net != 2) { + /* inherit == 0 or 1: copy from init_net */ + memcpy(all, init_net.ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } + /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fd885f06c4ed..ab7e839753ae 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6991,9 +6991,26 @@ static int __net_init addrconf_init_net(struct net *net) goto err_alloc_dflt; if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); - memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt)); + !net_eq(net, &init_net)) { + switch (sysctl_devconf_inherit_init_net) { + case 1: /* copy from init_net */ + memcpy(all, init_net.ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, init_net.ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 3: /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 0: + case 2: + /* use compiled values */ + break; + } } /* these will be inherited by all namespaces */ -- cgit v1.2.3 From a0c1d0eafd1ef1ada3b588ea205e5bc37ae0d8d9 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 14 May 2020 08:53:03 -0700 Subject: mptcp: Use 32-bit DATA_ACK when possible RFC8684 allows to send 32-bit DATA_ACKs as long as the peer is not sending 64-bit data-sequence numbers. 
The 64-bit DSN is only there for extreme scenarios when a very high throughput subflow is combined with a long-RTT subflow such that the high-throughput subflow wraps around the 32-bit sequence number space within an RTT of the high-RTT subflow. It is thus a rare scenario and we should try to use the 32-bit DATA_ACK instead as long as possible. It reduces the TCP-option overhead by 4 bytes, thus making space for an additional SACK block. It also makes tcpdumps much easier to read when the DSN and DATA_ACK are both either 32 or 64-bit. Signed-off-by: Christoph Paasch Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/mptcp.h | 5 ++++- net/mptcp/options.c | 33 ++++++++++++++++++++++++--------- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 2 ++ 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index c4a6ef4ba35b..46d0487d2b22 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -16,7 +16,10 @@ struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { - u64 data_ack; + union { + u64 data_ack; + u32 data_ack32; + }; u64 data_seq; u32 subflow_seq; u16 data_len; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 45497af23906..ece6f92cf7d1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -516,7 +516,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } - ack_size = TCPOLEN_MPTCP_DSS_ACK64; + if (subflow->use_64bit_ack) { + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.ack64 = 1; + } else { + ack_size = TCPOLEN_MPTCP_DSS_ACK32; + opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq); + opts->ext_copy.ack64 = 0; + } + opts->ext_copy.use_ack = 1; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -524,10 +533,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size += ack_size; - opts->ext_copy.data_ack = msk->ack_seq; - opts->ext_copy.ack64 = 1; - opts->ext_copy.use_ack = 1; - *size = ALIGN(dss_size, 4); return true; } @@ -986,8 +991,13 @@ mp_capable_done: u8 flags = 0; if (mpext->use_ack) { - len += TCPOLEN_MPTCP_DSS_ACK64; - flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK; + if (mpext->ack64) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags |= MPTCP_DSS_ACK64; + } else { + len += TCPOLEN_MPTCP_DSS_ACK32; + } } if (mpext->use_map) { @@ -1004,8 +1014,13 @@ mp_capable_done: *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { - put_unaligned_be64(mpext->data_ack, ptr); - ptr += 2; + if (mpext->ack64) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } else { + put_unaligned_be32(mpext->data_ack32, ptr); + ptr += 1; + } } if (mpext->use_map) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e4ca6320ce76..f5adca93e8fb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -290,6 +290,7 @@ struct mptcp_subflow_context { data_avail : 1, rx_eof : 1, data_fin_tx_enable : 1, + use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ u64 data_fin_tx_seq; u32 remote_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 53c75b0e5dce..0020d356233d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -667,9 +667,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (!mpext->dsn64) { map_seq =
expand_seq(subflow->map_seq, subflow->map_data_len, mpext->data_seq); + subflow->use_64bit_ack = 0; pr_debug("expanded seq=%llu", subflow->map_seq); } else { map_seq = mpext->data_seq; + subflow->use_64bit_ack = 1; } if (subflow->map_valid) { -- cgit v1.2.3 From 7be92514b99c15b89def6d72bbc84c354f89a025 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2020 12:49:00 -0700 Subject: ethtool: check if there is at least one channel for TX/RX in the core Having a channel config with no ability to RX or TX traffic is clearly wrong. Check for this in the core so the drivers don't have to. Signed-off-by: Jakub Kicinski Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/channels.c | 20 ++++++++++++++++++-- net/ethtool/ioctl.c | 5 +++++ 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 389924b65d05..3aa4975919d7 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -129,13 +129,13 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CHANNELS_MAX + 1]; unsigned int from_channel, old_total, i; + bool mod = false, mod_combined = false; struct ethtool_channels channels = {}; struct ethnl_req_info req_info = {}; const struct nlattr *err_attr; const struct ethtool_ops *ops; struct net_device *dev; u32 max_rx_in_use = 0; - bool mod = false; int ret; ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, @@ -170,7 +170,8 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&channels.other_count, tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); ethnl_update_u32(&channels.combined_count, - tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod); + tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); + mod |= mod_combined; ret = 0; if (!mod) goto out_ops; @@ -193,6 +194,21 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) goto out_ops; } + /* ensure there is at least one RX and one TX channel */ + if (!channels.combined_count && !channels.rx_count) + err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT]; + else if (!channels.combined_count && !channels.tx_count) + err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT]; + else + err_attr = NULL; + if (err_attr) { + if (mod_combined) + err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT]; + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested channel counts would result in no RX or TX channel being configured"); + goto out_ops; + } + /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 52102ab1709b..a574d60111fa 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1676,6 +1676,11 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, channels.other_count > curr.max_other) return -EINVAL; + /* ensure there is at least one RX and one TX channel */ + if (!channels.combined_count && + (!channels.rx_count || !channels.tx_count)) + return -EINVAL; + /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ if (netif_is_rxfh_configured(dev) && -- cgit v1.2.3 From 75c36dbb1c3790eab909344b758decce6bb432da Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2020 12:49:02 -0700 Subject: ethtool: don't call set_channels in drivers if config didn't change Don't call drivers if nothing changed. Netlink code already contains this logic. 
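For illustration, a sketch of what the ioctl path now guarantees before any driver callback runs; the structure is the standard ethtool UAPI one, while the socket/ifreq plumbing is omitted and assumed:

	struct ethtool_channels ch = {
		.cmd            = ETHTOOL_SCHANNELS,
		.rx_count       = 0,   /* no dedicated RX channels */
		.combined_count = 0,   /* and no combined channels */
		.tx_count       = 4,
	};
	/* ethtool_set_channels() now rejects this request with -EINVAL in
	 * the core, since no RX path would remain; and a request whose
	 * counts match the current configuration returns 0 without
	 * ops->set_channels() ever being called.
	 */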
Signed-off-by: Jakub Kicinski Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a574d60111fa..eeb1137a3f23 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1669,6 +1669,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, dev->ethtool_ops->get_channels(dev, &curr); + if (channels.rx_count == curr.rx_count && + channels.tx_count == curr.tx_count && + channels.combined_count == curr.combined_count && + channels.other_count == curr.other_count) + return 0; + /* ensure new counts are within the maximums */ if (channels.rx_count > curr.max_rx || channels.tx_count > curr.max_tx || -- cgit v1.2.3 From a0e17064d43e445181bc004d949a4855ea8ccf9c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:17 +0200 Subject: mptcp: move common nospace-pattern to a helper Paolo noticed that ssk_check_wmem() has the same pattern, so add/use a common helper for both places. Suggested-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1f52a0fa31ed..0413454fcdaf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -653,6 +653,15 @@ out: return ret; } +static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) +{ + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); +} + static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -666,13 +675,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!sk_stream_memory_free(ssk)) { struct socket *sock = ssk->sk_socket; - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* enables sk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); return NULL; } @@ -698,13 +702,8 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) return; sock = READ_ONCE(ssk->sk_socket); - - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - /* set NOSPACE only after clearing SEND_SPACE flag */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -- cgit v1.2.3 From fb529e62d3f3e85001108213dc323c35f2765575 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:18 +0200 Subject: mptcp: break and restart in case mptcp sndbuf is full It's not enough to check for available tcp send space. We also hold on to transmitted data for mptcp-level retransmits. Right now we will send more and more data if the peer can ack data at the tcp level fast enough, since that frees up tcp send buffer space. But we also need to check that data was acked and reclaimed at the mptcp level. Therefore add the needed check in mptcp_sendmsg, flush tcp data and wait until more mptcp snd space becomes available if we are over the limit. Before we wait for more data, also make sure we start the retransmit timer if we ran out of sndbuf space.
Otherwise there is a very small chance that we wait forever: * receiver is waiting for data * sender is blocked because mptcp socket buffer is full * at tcp level, all data was acked * mptcp-level snd_una was not updated, because last ack that acknowledged the last data packet carried an older MPTCP-ack. Restarting the retransmit timer avoids this problem: if TCP subflow is idle, data is retransmitted from the RTX queue. New data will make the peer send a new, updated MPTCP-Ack. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0413454fcdaf..75be8d662ac5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -739,9 +739,23 @@ fallback: mptcp_clean_una(sk); +wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); while (!sk_stream_memory_free(sk) || !ssk) { + if (ssk) { + /* make sure retransmit timer is + * running before we wait for memory. + * + * The retransmit timer might be needed + * to make the peer send an up-to-date + * MPTCP Ack. + */ + mptcp_set_timeout(sk, ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + } + ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; @@ -776,6 +790,28 @@ fallback: } copied += ret; + + /* memory is charged to mptcp level socket as well, i.e. + * if msg is very large, mptcp socket may run out of buffer + * space. mptcp_clean_una() will release data that has + * been acked at mptcp level in the mean time, so there is + * a good chance we can continue sending data right away. + */ + if (unlikely(!sk_stream_memory_free(sk))) { + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_clean_una(sk); + if (!sk_stream_memory_free(sk)) { + /* can't send more for now, need to wait for + * MPTCP-level ACKs from peer. + * + * Wakeup will happen via mptcp_clean_una(). + */ + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto wait_for_sndbuf; + } + } } mptcp_set_timeout(sk, ssk); -- cgit v1.2.3 From 72511aab95c94d7c0f03d0b7db5df47fdca059f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:19 +0200 Subject: mptcp: avoid blocking in tcp_sendpages The transmit loop continues to xmit new data until an error is returned or all data was transmitted. For the blocking i/o case, this means that tcp_sendpages() may block on the subflow until more space becomes available, i.e. we end up sleeping with the mptcp socket lock held. Instead we should check if a different subflow is ready to be used. This restarts the subflow sk lookup when the tx operation succeeded and the tcp subflow can't accept more data or if tcp_sendpages indicates -EAGAIN on a blocking mptcp socket. In that case we also need to set the NOSPACE bit to make sure we get notified once memory becomes available. In case all subflows are busy, the existing logic will wait until a subflow is ready, releasing the mptcp socket lock while doing so. The mptcp worker already sets DONTWAIT, so no need to make changes there. v2: * set NOSPACE bit * add a comment to clarify that mptcp-sk sndbuf limits need to be checked as well. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 75be8d662ac5..e97357066b21 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -590,7 +590,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * access the skb after the sendpages call */ ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST); + msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); if (ret <= 0) return ret; @@ -713,6 +713,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct socket *ssock; size_t copied = 0; struct sock *ssk; + bool tx_ok; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -737,6 +738,7 @@ fallback: return ret >= 0 ? ret + copied : (copied ? copied : ret); } +restart: mptcp_clean_una(sk); wait_for_sndbuf: @@ -772,11 +774,18 @@ wait_for_sndbuf: pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); - while (msg_data_left(msg)) { + tx_ok = msg_data_left(msg); + while (tx_ok) { ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); - if (ret < 0) + if (ret < 0) { + if (ret == -EAGAIN && timeo > 0) { + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } break; + } if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { /* Can happen for passive sockets: * 3WHS negotiated MPTCP, but first packet after is @@ -791,11 +800,31 @@ wait_for_sndbuf: copied += ret; + tx_ok = msg_data_left(msg); + if (!tx_ok) + break; + + if (!sk_stream_memory_free(ssk)) { + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } + /* memory is charged to mptcp level socket as well, i.e. * if msg is very large, mptcp socket may run out of buffer * space. mptcp_clean_una() will release data that has * been acked at mptcp level in the mean time, so there is * a good chance we can continue sending data right away. + * + * Normally, when the tcp subflow can accept more data, then + * so can the MPTCP socket. However, we need to cope with + * peers that might lag behind in their MPTCP-level + * acknowledgements, i.e. data might have been acked at + * tcp level only. So, we must also check the MPTCP socket + * limits before we send more data. */ if (unlikely(!sk_stream_memory_free(sk))) { tcp_push(ssk, msg->msg_flags, mss_now, -- cgit v1.2.3 From 149f7c71e2c710a8ced836421a631953c9f84aa3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:20 +0200 Subject: mptcp: fill skb extension cache outside of mptcp_sendmsg_frag The mptcp_sendmsg_frag helper contains a loop that will wait on the subflow sk. It seems preferable to only wait in mptcp_sendmsg() when blocking io is requested. mptcp_sendmsg already has such a wait loop that is used when no subflow socket is available for transmission. This is a preparation patch that makes sure we call mptcp_sendmsg_frag only if an skb extension has been allocated. Moreover, such allocation currently uses GFP_ATOMIC while it could use a sleeping allocation instead. Followup patches will remove the wait loop from mptcp_sendmsg_frag() and will allow doing a sleeping allocation for the extension. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- net/mptcp/protocol.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e97357066b21..1bdfbca1c23a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -669,6 +669,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); + if (!mptcp_ext_cache_refill(msk)) + return NULL; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -804,7 +807,8 @@ wait_for_sndbuf: if (!tx_ok) break; - if (!sk_stream_memory_free(ssk)) { + if (!sk_stream_memory_free(ssk) || + !mptcp_ext_cache_refill(msk)) { set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); @@ -1158,7 +1162,7 @@ static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0; + int orig_len, orig_offset, mss_now = 0, size_goal = 0; struct mptcp_data_frag *dfrag; u64 orig_write_seq; size_t copied = 0; @@ -1180,6 +1184,9 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; + if (!mptcp_ext_cache_refill(msk)) + goto reset_unlock; + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; @@ -1191,8 +1198,8 @@ static void mptcp_worker(struct work_struct *work) orig_offset = dfrag->offset; orig_write_seq = dfrag->data_seq; while (dfrag->data_len > 0) { - ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now, - &size_goal); + int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, + &mss_now, &size_goal); + if (ret < 0) break; @@ -1200,6 +1207,9 @@ static void mptcp_worker(struct work_struct *work) copied += ret; dfrag->data_len -= ret; dfrag->offset += ret; + + if (!mptcp_ext_cache_refill(msk)) + break; } if (copied) tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, -- cgit v1.2.3 From 17091708d1e503383f20934631305ccb375b0eb1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:21 +0200 Subject: mptcp: fill skb page frag cache outside of mptcp_sendmsg_frag The mptcp_sendmsg_frag helper contains a loop that will wait on the subflow sk. It seems preferable to only wait in mptcp_sendmsg() when blocking io is requested. mptcp_sendmsg already has such a wait loop that is used when no subflow socket is available for transmission. This is another preparation patch that makes sure we call mptcp_sendmsg_frag only if the page frag cache has been refilled. A followup patch will remove the wait loop from mptcp_sendmsg_frag(). The retransmit worker doesn't need to do this refill as it won't transmit new mptcp-level data. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1bdfbca1c23a..a11e51222e59 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -713,6 +713,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); + struct page_frag *pfrag; struct socket *ssock; size_t copied = 0; struct sock *ssk; @@ -741,13 +742,16 @@ fallback: return ret >= 0 ? ret + copied : (copied ?
copied : ret); } + pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); - while (!sk_stream_memory_free(sk) || !ssk) { + while (!sk_stream_memory_free(sk) || + !ssk || + !mptcp_page_frag_refill(ssk, pfrag)) { if (ssk) { /* make sure retransmit timer is * running before we wait for memory. @@ -808,6 +812,7 @@ wait_for_sndbuf: break; if (!sk_stream_memory_free(ssk) || + !mptcp_page_frag_refill(ssk, pfrag) || !mptcp_ext_cache_refill(msk)) { set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, -- cgit v1.2.3 From 5c8264435d4f6a056ac926989a827aba1961e3c8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:22 +0200 Subject: mptcp: remove inner wait loop from mptcp_sendmsg_frag previous patches made sure we only call into this function when these prerequisites are met, so no need to wait on the subflow socket anymore. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/7 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a11e51222e59..bc950cf818f7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -510,20 +510,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || - !mptcp_ext_cache_refill(msk)) { - ret = sk_stream_wait_memory(ssk, timeo); - if (ret) - return ret; - - /* if sk_stream_wait_memory() sleeps snd_una can change - * significantly, refresh the rtx queue - */ - mptcp_clean_una(sk); - - if (unlikely(__mptcp_needs_tcp_fallback(msk))) - return 0; - } if (!retransmission) { write_seq = &msk->write_seq; page = pfrag->page; -- cgit v1.2.3 From 4930f4831b1547b52c5968e9307fe3d840d7fba0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:23 +0200 Subject: net: allow __skb_ext_alloc to sleep mptcp calls this from the transmit side, from process context. Allow a sleeping allocation instead of unconditional GFP_ATOMIC. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 8 +++++--- net/mptcp/protocol.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3000c526f552..531843952809 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4165,7 +4165,7 @@ struct skb_ext { char data[] __aligned(8); }; -struct skb_ext *__skb_ext_alloc(void); +struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1bf0c3d278e7..35a133c6d13b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6087,13 +6087,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. 
*/ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6190,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc950cf818f7..e3a628bea2b8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -367,8 +367,10 @@ static void mptcp_stop_timer(struct sock *sk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; + if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(); + msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); return !!msk->cached_ext; } -- cgit v1.2.3 From dbfe7d74376e187f3c6eaff822e85176bc2cd06e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sat, 16 May 2020 18:23:36 -0700 Subject: rds: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages_fast(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages_fast() + put_page() calls to pin_user_pages_fast() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Cc: David S. Miller Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: rds-devel@oss.oracle.com Signed-off-by: John Hubbard Signed-off-by: David S. Miller --- net/rds/info.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/info.c b/net/rds/info.c index 03f6fd56d237..e1d63563e81c 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -162,7 +162,6 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, struct rds_info_lengths lens; unsigned long nr_pages = 0; unsigned long start; - unsigned long i; rds_info_func func; struct page **pages = NULL; int ret; @@ -193,7 +192,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); + ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; @@ -235,8 +234,7 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages && i < nr_pages; i++) - put_page(pages[i]); + unpin_user_pages(pages, nr_pages); kfree(pages); return ret; -- cgit v1.2.3 From 49c06c9eb14ba61725c1c82e5107f4e4bd6c1886 Mon Sep 17 00:00:00 2001 From: Łukasz Rymanowski Date: Wed, 13 May 2020 10:18:53 +0200 Subject: Bluetooth: Fix for GAP/SEC/SEM/BI-10-C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security Mode 1 Level 4 forces us to use a key size of 16 octets. This patch adds a check for that. 
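The intent of the added check can be shown as a small standalone C sketch. The key-size bounds match the SMP-defined minimum and maximum of 7 and 16 octets; the helper name and the plain boolean fips parameter are simplifications for illustration, not kernel API:

	#include <stdbool.h>

	#define SMP_MIN_ENC_KEY_SIZE	7
	#define SMP_MAX_ENC_KEY_SIZE	16

	/* Sketch: under Security Mode 1 Level 4 (FIPS), only the full
	 * 16-octet key is acceptable; otherwise the usual range applies.
	 */
	static bool smp_key_size_ok(bool fips, unsigned int max_key_size,
				    unsigned int le_max_key_size)
	{
		if (fips && max_key_size != SMP_MAX_ENC_KEY_SIZE)
			return false;
		return max_key_size >= SMP_MIN_ENC_KEY_SIZE &&
		       max_key_size <= le_max_key_size;
	}
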
This is required for the qualification test GAP/SEC/SEM/BI-10-C. Logs from the test when ATT is configured with sec level BT_SECURITY_FIPS: < ACL Data TX: Handle 3585 flags 0x00 dlen 11 #28 [hci0] 3.785965 SMP: Pairing Request (0x01) len 6 IO capability: DisplayYesNo (0x01) OOB data: Authentication data not present (0x00) Authentication requirement: Bonding, MITM, SC, No Keypresses (0x0d) Max encryption key size: 16 Initiator key distribution: EncKey Sign (0x05) Responder key distribution: EncKey IdKey Sign (0x07) > ACL Data RX: Handle 3585 flags 0x02 dlen 11 #35 [hci0] 3.883020 SMP: Pairing Response (0x02) len 6 IO capability: DisplayYesNo (0x01) OOB data: Authentication data not present (0x00) Authentication requirement: Bonding, MITM, SC, No Keypresses (0x0d) Max encryption key size: 7 Initiator key distribution: EncKey Sign (0x05) Responder key distribution: EncKey IdKey Sign (0x07) < ACL Data TX: Handle 3585 flags 0x00 dlen 6 #36 [hci0] 3.883136 SMP: Pairing Failed (0x05) len 1 Reason: Encryption key size (0x06) Signed-off-by: Łukasz Rymanowski Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 5510017cf9ff..6fd9ddb2d85c 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -730,6 +730,10 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; + if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS && + max_key_size != SMP_MAX_ENC_KEY_SIZE) + return SMP_ENC_KEY_SIZE; + if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; -- cgit v1.2.3 From 56b5453a86203a44726f523b4133c1feca49ce7c Mon Sep 17 00:00:00 2001 From: Hsin-Yu Chao Date: Fri, 15 May 2020 17:27:04 +0800 Subject: Bluetooth: Add SCO fallback for invalid LMP parameters error Bluetooth PTS test case HFP/AG/ACC/BI-12-I accepts a SCO connection with an invalid parameter at the first SCO request, expecting the AG to attempt another SCO request with the use of "safe settings" for the given codec, based on section 5.7.1.2 of the HFP 1.7 specification. This patch addresses it by adding "Invalid LMP Parameters" (0x1e) to the SCO fallback case. 
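In outline, the fallback catches selected Synchronous Connect Complete error statuses on an outgoing connection and re-issues the request with the next, more conservative parameter set. A condensed sketch of the dispatch follows; the status values match the hunk below, while the function and the retry helper are hypothetical stand-ins for the existing fallback path:

	/* Hypothetical helper: re-issue Setup Synchronous Connection with
	 * the next "safe settings" parameter set for the negotiated codec.
	 */
	void retry_sco_with_safe_settings(void);

	static void handle_sync_conn_status(bool outgoing, unsigned char status)
	{
		switch (status) {
		case 0x11: /* Unsupported Feature or Parameter Value */
		case 0x1c: /* SCO interval rejected */
		case 0x1a: /* Unsupported Remote Feature */
		case 0x1e: /* Invalid LMP Parameters (the newly handled status) */
		case 0x1f: /* Unspecified error */
		case 0x20: /* Unsupported LMP Parameter value */
			if (outgoing)
				retry_sco_with_safe_settings();
			break;
		default:
			break;
		}
	}
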
Verified with the log below: < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 Handle: 256 Transmit bandwidth: 8000 Receive bandwidth: 8000 Max latency: 13 Setting: 0x0003 Input Coding: Linear Input Data Format: 1's complement Input Sample Size: 8-bit # of bits padding at MSB: 0 Air Coding Format: Transparent Data Retransmission effort: Optimize for link quality (0x02) Packet type: 0x0380 3-EV3 may not be used 2-EV5 may not be used 3-EV5 may not be used > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) ncmd 1 Status: Success (0x00) > HCI Event: Number of Completed Packets (0x13) plen 5 Num handles: 1 Handle: 256 Count: 1 > HCI Event: Max Slots Change (0x1b) plen 3 Handle: 256 Max slots: 1 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 Status: Invalid LMP Parameters / Invalid LL Parameters (0x1e) Handle: 0 Address: 00:1B:DC:F2:21:59 (OUI 00-1B-DC) Link type: eSCO (0x02) Transmission interval: 0x00 Retransmission window: 0x02 RX packet length: 0 TX packet length: 0 Air mode: Transparent (0x03) < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 Handle: 256 Transmit bandwidth: 8000 Receive bandwidth: 8000 Max latency: 8 Setting: 0x0003 Input Coding: Linear Input Data Format: 1's complement Input Sample Size: 8-bit # of bits padding at MSB: 0 Air Coding Format: Transparent Data Retransmission effort: Optimize for link quality (0x02) Packet type: 0x03c8 EV3 may be used 2-EV3 may not be used 3-EV3 may not be used 2-EV5 may not be used 3-EV5 may not be used > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) ncmd 1 Status: Success (0x00) > HCI Event: Max Slots Change (0x1b) plen 3 Handle: 256 Max slots: 5 > HCI Event: Max Slots Change (0x1b) plen 3 Handle: 256 Max slots: 1 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 Status: Success (0x00) Handle: 257 Address: 00:1B:DC:F2:21:59 (OUI 00-1B-DC) Link type: eSCO (0x02) Transmission interval: 0x06 Retransmission window: 0x04 RX packet length: 30 TX packet length: 30 Air mode: Transparent (0x03) Signed-off-by: Hsin-Yu Chao Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 73aabca0064b..f024b3d57a1c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4337,6 +4337,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, case 0x11: /* Unsupported Feature or Parameter Value */ case 0x1c: /* SCO interval rejected */ case 0x1a: /* Unsupported Remote Feature */ + case 0x1e: /* Invalid LMP Parameters */ case 0x1f: /* Unspecified error */ case 0x20: /* Unsupported LMP Parameter value */ if (conn->out) { -- cgit v1.2.3 From 7c1552da900c159a13473a2106c7547746ebe4a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:05 +0200 Subject: ipv6: lift copy_from_user out of ipv6_route_ioctl Prepare for better compat ioctl handling by moving the user copy out of ipv6_route_ioctl. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 3 ++- net/ipv6/af_inet6.c | 16 ++++++++++------ net/ipv6/route.c | 44 ++++++++++++++++---------------------------- 3 files changed, 28 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index e525f003e619..2a5277758379 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -118,7 +118,8 @@ void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); -int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); +int ipv6_route_ioctl(struct net *net, unsigned int cmd, + struct in6_rtmsg *rtmsg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..a618beb9b6d5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -542,21 +542,25 @@ EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); switch (cmd) { case SIOCADDRT: - case SIOCDELRT: - - return ipv6_route_ioctl(net, cmd, (void __user *)arg); + case SIOCDELRT: { + struct in6_rtmsg rtmsg; + if (copy_from_user(&rtmsg, argp, sizeof(rtmsg))) + return -EFAULT; + return ipv6_route_ioctl(net, cmd, &rtmsg); + } case SIOCSIFADDR: - return addrconf_add_ifaddr(net, (void __user *) arg); + return addrconf_add_ifaddr(net, argp); case SIOCDIFADDR: - return addrconf_del_ifaddr(net, (void __user *) arg); + return addrconf_del_ifaddr(net, argp); case SIOCSIFDSTADDR: - return addrconf_set_dstaddr(net, (void __user *) arg); + return addrconf_set_dstaddr(net, argp); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a8b4add0b545..a52ec1b86432 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4336,41 +4336,29 @@ static void rtmsg_to_fib6_config(struct net *net, }; } -int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; - struct in6_rtmsg rtmsg; int err; - switch (cmd) { - case SIOCADDRT: /* Add a route */ - case SIOCDELRT: /* Delete a route */ - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - err = copy_from_user(&rtmsg, arg, - sizeof(struct in6_rtmsg)); - if (err) - return -EFAULT; + if (cmd != SIOCADDRT && cmd != SIOCDELRT) + return -EINVAL; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - rtmsg_to_fib6_config(net, &rtmsg, &cfg); + rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); - switch (cmd) { - case SIOCADDRT: - err = ip6_route_add(&cfg, GFP_KERNEL, NULL); - break; - case SIOCDELRT: - err = ip6_route_del(&cfg, NULL); - break; - default: - err = -EINVAL; - } - rtnl_unlock(); - - return err; + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); + break; + case SIOCDELRT: + err = ip6_route_del(&cfg, NULL); + break; } - - return -EINVAL; + rtnl_unlock(); + return err; } /* -- cgit v1.2.3 From 3986912f6a9aae50945fc9d3513c621381eba1aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:06 +0200 Subject: ipv6: move SIOCADDRT and SIOCDELRT handling into ->compat_ioctl To prepare for removing the global routing_ioctl hack, start lifting the code into a newly added ipv6 ->compat_ioctl handler. 
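For background on why these two commands need a compat path at all: the native route message carries an unsigned long field, so its size and the offsets of everything after it differ between 32-bit userland and a 64-bit kernel. The layout below is a sketch of include/uapi/linux/ipv6_route.h, quoted from memory for illustration only:

	#include <linux/in6.h>		/* struct in6_addr */
	#include <linux/types.h>	/* __u16, __u32 */

	/* Sketch of the native layout; on 64-bit kernels rtmsg_info is
	 * 8 bytes, for 32-bit userland it is 4, which is why the patch
	 * introduces a fixed-width compat_in6_rtmsg and copies it field
	 * by field.
	 */
	struct in6_rtmsg_sketch {
		struct in6_addr	rtmsg_dst;
		struct in6_addr	rtmsg_src;
		struct in6_addr	rtmsg_gateway;
		__u32		rtmsg_type;
		__u16		rtmsg_dst_len;
		__u16		rtmsg_src_len;
		__u32		rtmsg_metric;
		unsigned long	rtmsg_info;	/* ABI-dependent size */
		__u32		rtmsg_flags;
		int		rtmsg_ifindex;
	};
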
Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ net/dccp/ipv6.c | 1 + net/ipv6/af_inet6.c | 53 +++++++++++++++++++++++++++++++++++++++++++ net/ipv6/raw.c | 1 + net/l2tp/l2tp_ip6.c | 1 + net/mptcp/protocol.c | 1 + net/sctp/ipv6.c | 1 + net/socket.c | 63 ++++++++++++++-------------------------------------- 8 files changed, 77 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 955badd1e8ff..5fc3a9d7b053 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1115,6 +1115,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1e5e08cc0bfc..650187d68851 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1082,6 +1082,7 @@ static const struct proto_ops inet6_dccp_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a618beb9b6d5..b69496eaf922 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -571,6 +572,56 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +#ifdef CONFIG_COMPAT +struct compat_in6_rtmsg { + struct in6_addr rtmsg_dst; + struct in6_addr rtmsg_src; + struct in6_addr rtmsg_gateway; + u32 rtmsg_type; + u16 rtmsg_dst_len; + u16 rtmsg_src_len; + u32 rtmsg_metric; + u32 rtmsg_info; + u32 rtmsg_flags; + s32 rtmsg_ifindex; +}; + +static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_in6_rtmsg __user *ur) +{ + struct in6_rtmsg rt; + + if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst, + 3 * sizeof(struct in6_addr)) || + get_user(rt.rtmsg_type, &ur->rtmsg_type) || + get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) || + get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) || + get_user(rt.rtmsg_metric, &ur->rtmsg_metric) || + get_user(rt.rtmsg_info, &ur->rtmsg_info) || + get_user(rt.rtmsg_flags, &ur->rtmsg_flags) || + get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex)) + return -EFAULT; + + + return ipv6_route_ioctl(sock_net(sk), cmd, &rt); +} + +int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return inet6_compat_routing_ioctl(sk, cmd, argp); + default: + return -ENOIOCTLCMD; + } +} +EXPORT_SYMBOL_GPL(inet6_compat_ioctl); +#endif /* CONFIG_COMPAT */ + INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, size_t)); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -632,6 +683,7 @@ const struct proto_ops inet6_stream_ops = { .read_sock = tcp_read_sock, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif @@ -660,6 +712,7 @@ const struct proto_ops 
inet6_dgram_ops = { .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0028aa1d7869..8ef5a7b30524 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1377,6 +1377,7 @@ const struct proto_ops inet6_sockraw_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d148766f40d1..fdfef926c591 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -758,6 +758,7 @@ static const struct proto_ops l2tp_ip6_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e3a628bea2b8..ba9d3d5c625f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2068,6 +2068,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c87af430107a..ccfa0ab3e7f4 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1032,6 +1032,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/socket.c b/net/socket.c index 1c9a7260a41d..682447075775 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3384,62 +3384,33 @@ struct rtentry32 { unsigned short rt_irtt; /* Initial RTT */ }; -struct in6_rtmsg32 { - struct in6_addr rtmsg_dst; - struct in6_addr rtmsg_src; - struct in6_addr rtmsg_gateway; - u32 rtmsg_type; - u16 rtmsg_dst_len; - u16 rtmsg_src_len; - u32 rtmsg_metric; - u32 rtmsg_info; - u32 rtmsg_flags; - s32 rtmsg_ifindex; -}; - static int routing_ioctl(struct net *net, struct socket *sock, unsigned int cmd, void __user *argp) { + struct rtentry32 __user *ur4 = argp; int ret; void *r = NULL; - struct in6_rtmsg r6; struct rtentry r4; char devname[16]; u32 rtdev; mm_segment_t old_fs = get_fs(); - if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */ - struct in6_rtmsg32 __user *ur6 = argp; - ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), - 3 * sizeof(struct in6_addr)); - ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); - - r = (void *) &r6; - } else { /* ipv4 */ - struct rtentry32 __user *ur4 = argp; - ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), - 3 * sizeof(struct sockaddr)); - ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= 
get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= get_user(r4.rt_window, &(ur4->rt_window)); - ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= get_user(rtdev, &(ur4->rt_dev)); - if (rtdev) { - ret |= copy_from_user(devname, compat_ptr(rtdev), 15); - r4.rt_dev = (char __user __force *)devname; - devname[15] = 0; - } else - r4.rt_dev = NULL; - - r = (void *) &r4; - } + ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), + 3 * sizeof(struct sockaddr)); + ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= get_user(r4.rt_window, &(ur4->rt_window)); + ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); + ret |= get_user(rtdev, &(ur4->rt_dev)); + if (rtdev) { + ret |= copy_from_user(devname, compat_ptr(rtdev), 15); + r4.rt_dev = (char __user __force *)devname; + devname[15] = 0; + } else + r4.rt_dev = NULL; + + r = (void *) &r4; if (ret) { ret = -EFAULT; -- cgit v1.2.3 From a50049235483b0337d129e2878e99750e4da3ac2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:07 +0200 Subject: appletalk: factor out an atrtr_ioctl_addrt helper Add a helper that can be shared with the upcoming compat ioctl handler. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index b41375d4d295..4177a74f6543 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -867,6 +867,24 @@ static int atif_ioctl(int cmd, void __user *arg) return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0; } +static int atrtr_ioctl_addrt(struct rtentry *rt) +{ + struct net_device *dev = NULL; + + if (rt->rt_dev) { + char name[IFNAMSIZ]; + + if (copy_from_user(name, rt->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + name[IFNAMSIZ-1] = '\0'; + + dev = __dev_get_by_name(&init_net, name); + if (!dev) + return -ENODEV; + } + return atrtr_create(rt, dev); +} + /* Routing ioctl() calls */ static int atrtr_ioctl(unsigned int cmd, void __user *arg) { @@ -882,19 +900,8 @@ static int atrtr_ioctl(unsigned int cmd, void __user *arg) return atrtr_delete(&((struct sockaddr_at *) &rt.rt_dst)->sat_addr); - case SIOCADDRT: { - struct net_device *dev = NULL; - if (rt.rt_dev) { - char name[IFNAMSIZ]; - if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1)) - return -EFAULT; - name[IFNAMSIZ-1] = '\0'; - dev = __dev_get_by_name(&init_net, name); - if (!dev) - return -ENODEV; - } - return atrtr_create(&rt, dev); - } + case SIOCADDRT: + return atrtr_ioctl_addrt(&rt); } return -EINVAL; } -- cgit v1.2.3 From dc13c8761c91c06acd3d98cd107f371cba9811b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2020 08:28:08 +0200 Subject: ipv4,appletalk: move SIOCADDRT and SIOCDELRT handling into ->compat_ioctl To prepare for removing the global routing_ioctl hack, start lifting the code into the ipv4 and appletalk ->compat_ioctl handlers. Unlike the existing handler, we don't bother copying in the name - there are no compat issues for char arrays. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/compat.h | 18 +++++++++++++++ net/appletalk/ddp.c | 49 ++++++++++++++++++++++++++++++++++++----- net/ipv4/af_inet.c | 38 +++++++++++++++++++++++++++----- net/socket.c | 62 ---------------------------------------------------- 4 files changed, 94 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index e341260642fe..2b5e1f7ba153 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -30,6 +30,24 @@ struct compat_cmsghdr { compat_int_t cmsg_type; }; +struct compat_rtentry { + u32 rt_pad1; + struct sockaddr rt_dst; /* target address */ + struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ + struct sockaddr rt_genmask; /* target network mask (IP) */ + unsigned short rt_flags; + short rt_pad2; + u32 rt_pad3; + unsigned char rt_tos; + unsigned char rt_class; + short rt_pad4; + short rt_metric; /* +1 for binary compatibility! */ + compat_uptr_t rt_dev; /* forcing the device at add */ + u32 rt_mtu; /* per route MTU/Window */ + u32 rt_window; /* Window clamping */ + unsigned short rt_irtt; /* Initial RTT */ +}; + #else /* defined(CONFIG_COMPAT) */ /* * To avoid compiler warnings: diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4177a74f6543..15787e8c0629 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -1839,20 +1840,58 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_COMPAT +static int atalk_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_rtentry __user *ur) +{ + compat_uptr_t rtdev; + struct rtentry rt; + + if (copy_from_user(&rt.rt_dst, &ur->rt_dst, + 3 * sizeof(struct sockaddr)) || + get_user(rt.rt_flags, &ur->rt_flags) || + get_user(rt.rt_metric, &ur->rt_metric) || + get_user(rt.rt_mtu, &ur->rt_mtu) || + get_user(rt.rt_window, &ur->rt_window) || + get_user(rt.rt_irtt, &ur->rt_irtt) || + get_user(rtdev, &ur->rt_dev)) + return -EFAULT; + + switch (cmd) { + case SIOCDELRT: + if (rt.rt_dst.sa_family != AF_APPLETALK) + return -EINVAL; + return atrtr_delete(&((struct sockaddr_at *) + &rt.rt_dst)->sat_addr); + + case SIOCADDRT: + rt.rt_dev = compat_ptr(rtdev); + return atrtr_ioctl_addrt(&rt); + default: + return -EINVAL; + } +} static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return atalk_compat_routing_ioctl(sk, cmd, argp); /* * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we * cannot handle it in common code. The data we access if ifreq * here is compatible, so we can simply call the native * handler. 
*/ - if (cmd == SIOCATALKDIFADDR) - return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); - - return -ENOIOCTLCMD; + case SIOCATALKDIFADDR: + return atalk_ioctl(sock, cmd, (unsigned long)argp); + default: + return -ENOIOCTLCMD; + } } -#endif +#endif /* CONFIG_COMPAT */ static const struct net_proto_family atalk_family_ops = { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..c35a8b2e0499 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -116,6 +116,7 @@ #include #endif #include +#include #include @@ -970,17 +971,42 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT +static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_rtentry __user *ur) +{ + compat_uptr_t rtdev; + struct rtentry rt; + + if (copy_from_user(&rt.rt_dst, &ur->rt_dst, + 3 * sizeof(struct sockaddr)) || + get_user(rt.rt_flags, &ur->rt_flags) || + get_user(rt.rt_metric, &ur->rt_metric) || + get_user(rt.rt_mtu, &ur->rt_mtu) || + get_user(rt.rt_window, &ur->rt_window) || + get_user(rt.rt_irtt, &ur->rt_irtt) || + get_user(rtdev, &ur->rt_dev)) + return -EFAULT; + + rt.rt_dev = compat_ptr(rtdev); + return ip_rt_ioctl(sock_net(sk), cmd, &rt); +} + static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; - int err = -ENOIOCTLCMD; - - if (sk->sk_prot->compat_ioctl) - err = sk->sk_prot->compat_ioctl(sk, cmd, arg); - return err; + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return inet_compat_routing_ioctl(sk, cmd, argp); + default: + if (!sk->sk_prot->compat_ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->compat_ioctl(sk, cmd, arg); + } } -#endif +#endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, diff --git a/net/socket.c b/net/socket.c index 682447075775..80422fc3c836 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3366,65 +3366,6 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, return err; } -struct rtentry32 { - u32 rt_pad1; - struct sockaddr rt_dst; /* target address */ - struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ - struct sockaddr rt_genmask; /* target network mask (IP) */ - unsigned short rt_flags; - short rt_pad2; - u32 rt_pad3; - unsigned char rt_tos; - unsigned char rt_class; - short rt_pad4; - short rt_metric; /* +1 for binary compatibility! 
*/ - /* char * */ u32 rt_dev; /* forcing the device at add */ - u32 rt_mtu; /* per route MTU/Window */ - u32 rt_window; /* Window clamping */ - unsigned short rt_irtt; /* Initial RTT */ -}; - -static int routing_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, void __user *argp) -{ - struct rtentry32 __user *ur4 = argp; - int ret; - void *r = NULL; - struct rtentry r4; - char devname[16]; - u32 rtdev; - mm_segment_t old_fs = get_fs(); - - ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), - 3 * sizeof(struct sockaddr)); - ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= get_user(r4.rt_window, &(ur4->rt_window)); - ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= get_user(rtdev, &(ur4->rt_dev)); - if (rtdev) { - ret |= copy_from_user(devname, compat_ptr(rtdev), 15); - r4.rt_dev = (char __user __force *)devname; - devname[15] = 0; - } else - r4.rt_dev = NULL; - - r = (void *) &r4; - - if (ret) { - ret = -EFAULT; - goto out; - } - - set_fs(KERNEL_DS); - ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); - set_fs(old_fs); - -out: - return ret; -} - /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE * for some operations; this forces use of the newer bridge-utils that * use compatible ioctls @@ -3463,9 +3404,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFMAP: case SIOCSIFMAP: return compat_sioc_ifmap(net, cmd, argp); - case SIOCADDRT: - case SIOCDELRT: - return routing_ioctl(net, sock, cmd, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) -- cgit v1.2.3 From d800bad67d4c21aaf11722f04e0f7547fb915ab5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 18 May 2020 15:05:27 +0200 Subject: bpf: Fix too large copy from user in bpf_test_init Commit bc56c919fce7 ("bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp().") recently changed bpf_prog_test_run_xdp() to use larger frames for XDP in order to test tail growing frames (via bpf_xdp_adjust_tail) and to have the memory backing the frame better resemble drivers. The commit contains a bug, as it tries to copy the max data size from userspace, instead of the size provided by userspace. This causes XDP unit tests to fail sporadically with EFAULT, an unfortunate behavior. The fix is to only copy the size specified by userspace. 
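In isolation, the corrected pattern looks like this: the buffer stays sized for the maximum frame, oversized input is rejected up front, and the copy uses the length userspace actually supplied. A minimal sketch using the names from the hunk below (the wrapper function itself is illustrative):

	/* Assumes linux/slab.h, linux/uaccess.h and linux/err.h. */
	static void *test_init_sketch(void __user *data_in, u32 size,
				      u32 user_size, u32 headroom, u32 tailroom)
	{
		void *data;

		if (user_size > size)
			return ERR_PTR(-EMSGSIZE);	/* reject oversized input */

		data = kzalloc(size + headroom + tailroom, GFP_USER);
		if (!data)
			return ERR_PTR(-ENOMEM);

		/* Copying 'size' bytes here was the bug: it can read past the
		 * end of the user buffer and fault. Copy 'user_size' instead.
		 */
		if (copy_from_user(data + headroom, data_in, user_size)) {
			kfree(data);
			return ERR_PTR(-EFAULT);
		}
		return data;
	}
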
Fixes: bc56c919fce7 ("bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp().") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/158980712729.256597.6115007718472928659.stgit@firesoul --- net/bpf/test_run.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 30ba7d38941d..bfd4ccd80847 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + u32 user_size = kattr->test.data_size_in; void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); + if (user_size > size) + return ERR_PTR(-EMSGSIZE); + data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); - if (copy_from_user(data + headroom, data_in, size)) { + if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } @@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; - if (size > max_data_sz) - return -EINVAL; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer,sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore makes it possible to avoid packet-based NAT, given that in the connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows exposing NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but on a few occasions we've seen breakage when validating the peer's address since it unexpectedly returns the backend tuple instead of the service one. Therefore, to address this situation, this trivial patch adds a getpeername() as well as a getsockname() BPF cgroup hook for both IPv4 and IPv6, which allows the returned addresses to be customised. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before: curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] 
After: with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context in a fashion similar to inet{,6}_getname(), but API-wise this is suboptimal as it always forces programs to test for ctx->peer, which can easily be missed, hence the BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in the future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 9 ++++++--- tools/include/uapi/linux/bpf.h | 4 ++++ 8 files changed, 42 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case 
BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..8f5c8c9409d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -755,12 +755,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..3b6fcc0c321a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? 
BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..1cddc398404a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 4b32f86bf1673acb16441dd55d7b325609f54897 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:08 +0200 Subject: net/iucv: remove pm support commit 394216275c7d ("s390: remove broken hibernate / power management support") removed support for ARCH_HIBERNATION_POSSIBLE from s390. So drop the unused pm ops from the s390-only iucv bus driver. CC: Hendrik Brueckner Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/iucv.c | 188 -------------------------------------------------------- 1 file changed, 188 deletions(-) (limited to 'net') diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 9a2d023842fe..19250a0c85d3 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -67,32 +67,9 @@ static int iucv_bus_match(struct device *dev, struct device_driver *drv) return 0; } -enum iucv_pm_states { - IUCV_PM_INITIAL = 0, - IUCV_PM_FREEZING = 1, - IUCV_PM_THAWING = 2, - IUCV_PM_RESTORING = 3, -}; -static enum iucv_pm_states iucv_pm_state; - -static int iucv_pm_prepare(struct device *); -static void iucv_pm_complete(struct device *); -static int iucv_pm_freeze(struct device *); -static int iucv_pm_thaw(struct device *); -static int iucv_pm_restore(struct device *); - -static const struct dev_pm_ops iucv_pm_ops = { - .prepare = iucv_pm_prepare, - .complete = iucv_pm_complete, - .freeze = iucv_pm_freeze, - .thaw = iucv_pm_thaw, - .restore = iucv_pm_restore, -}; - struct bus_type iucv_bus = { .name = "iucv", .match = iucv_bus_match, - .pm = &iucv_pm_ops, }; EXPORT_SYMBOL(iucv_bus); @@ -434,31 +411,6 @@ static void iucv_block_cpu(void *data) cpumask_clear_cpu(cpu, &iucv_irq_cpumask); } -/** - * iucv_block_cpu_almost - * @data: unused - * - * Allow connection-severed interrupts only on this cpu. - */ -static void iucv_block_cpu_almost(void *data) -{ - int cpu = smp_processor_id(); - union iucv_param *parm; - - /* Allow iucv control interrupts only */ - parm = iucv_param_irq[cpu]; - memset(parm, 0, sizeof(union iucv_param)); - parm->set_mask.ipmask = 0x08; - iucv_call_b2f0(IUCV_SETMASK, parm); - /* Allow iucv-severed interrupt only */ - memset(parm, 0, sizeof(union iucv_param)); - parm->set_mask.ipmask = 0x20; - iucv_call_b2f0(IUCV_SETCONTROLMASK, parm); - - /* Clear indication that iucv interrupts are allowed for this cpu. 
*/ - cpumask_clear_cpu(cpu, &iucv_irq_cpumask); -} - /** * iucv_declare_cpu * @data: unused @@ -1834,146 +1786,6 @@ static void iucv_external_interrupt(struct ext_code ext_code, spin_unlock(&iucv_queue_lock); } -static int iucv_pm_prepare(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_INFO "iucv_pm_prepare\n"); -#endif - if (dev->driver && dev->driver->pm && dev->driver->pm->prepare) - rc = dev->driver->pm->prepare(dev); - return rc; -} - -static void iucv_pm_complete(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_INFO "iucv_pm_complete\n"); -#endif - if (dev->driver && dev->driver->pm && dev->driver->pm->complete) - dev->driver->pm->complete(dev); -} - -/** - * iucv_path_table_empty() - determine if iucv path table is empty - * - * Returns 0 if there are still iucv pathes defined - * 1 if there are no iucv pathes defined - */ -static int iucv_path_table_empty(void) -{ - int i; - - for (i = 0; i < iucv_max_pathid; i++) { - if (iucv_path_table[i]) - return 0; - } - return 1; -} - -/** - * iucv_pm_freeze() - Freeze PM callback - * @dev: iucv-based device - * - * disable iucv interrupts - * invoke callback function of the iucv-based driver - * shut down iucv, if no iucv-pathes are established anymore - */ -static int iucv_pm_freeze(struct device *dev) -{ - int cpu; - struct iucv_irq_list *p, *n; - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_freeze\n"); -#endif - if (iucv_pm_state != IUCV_PM_FREEZING) { - for_each_cpu(cpu, &iucv_irq_cpumask) - smp_call_function_single(cpu, iucv_block_cpu_almost, - NULL, 1); - cancel_work_sync(&iucv_work); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - list_del_init(&p->list); - iucv_sever_pathid(p->data.ippathid, - iucv_error_no_listener); - kfree(p); - } - } - iucv_pm_state = IUCV_PM_FREEZING; - if (dev->driver && dev->driver->pm && dev->driver->pm->freeze) - rc = dev->driver->pm->freeze(dev); - if (iucv_path_table_empty()) - iucv_disable(); - return rc; -} - -/** - * iucv_pm_thaw() - Thaw PM callback - * @dev: iucv-based device - * - * make iucv ready for use again: allocate path table, declare interrupt buffers - * and enable iucv interrupts - * invoke callback function of the iucv-based driver - */ -static int iucv_pm_thaw(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_thaw\n"); -#endif - iucv_pm_state = IUCV_PM_THAWING; - if (!iucv_path_table) { - rc = iucv_enable(); - if (rc) - goto out; - } - if (cpumask_empty(&iucv_irq_cpumask)) { - if (iucv_nonsmp_handler) - /* enable interrupts on one cpu */ - iucv_allow_cpu(NULL); - else - /* enable interrupts on all cpus */ - iucv_setmask_mp(); - } - if (dev->driver && dev->driver->pm && dev->driver->pm->thaw) - rc = dev->driver->pm->thaw(dev); -out: - return rc; -} - -/** - * iucv_pm_restore() - Restore PM callback - * @dev: iucv-based device - * - * make iucv ready for use again: allocate path table, declare interrupt buffers - * and enable iucv interrupts - * invoke callback function of the iucv-based driver - */ -static int iucv_pm_restore(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table); -#endif - if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table) - pr_warn("Suspending Linux did not completely close all IUCV connections\n"); - iucv_pm_state = IUCV_PM_RESTORING; - if (cpumask_empty(&iucv_irq_cpumask)) { - rc = iucv_query_maxconn(); - rc = iucv_enable(); - if (rc) - goto out; - } - if (dev->driver 
&& dev->driver->pm && dev->driver->pm->restore) - rc = dev->driver->pm->restore(dev); -out: - return rc; -} - struct iucv_interface iucv_if = { .message_receive = iucv_message_receive, .__message_receive = __iucv_message_receive, -- cgit v1.2.3 From 585bc2209539501674b77de68f58e98039892501 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:09 +0200 Subject: net/af_iucv: remove pm support commit 394216275c7d ("s390: remove broken hibernate / power management support") removed support for ARCH_HIBERNATION_POSSIBLE from s390. So drop the unused pm ops from the s390-only af_iucv socket code. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 141 +---------------------------------------------------- 1 file changed, 1 insertion(+), 140 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c4bdcbc84b07..84bd18fea1d6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -87,7 +87,6 @@ do { \ static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static void iucv_sever_path(struct sock *, int); static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); @@ -127,110 +126,6 @@ static inline void low_nmcpy(unsigned char *dst, char *src) memcpy(&dst[8], src, 8); } -static int afiucv_pm_prepare(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_prepare\n"); -#endif - return 0; -} - -static void afiucv_pm_complete(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_complete\n"); -#endif -} - -/** - * afiucv_pm_freeze() - Freeze PM callback - * @dev: AFIUCV dummy device - * - * Sever all established IUCV communication pathes - */ -static int afiucv_pm_freeze(struct device *dev) -{ - struct iucv_sock *iucv; - struct sock *sk; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_freeze\n"); -#endif - read_lock(&iucv_sk_list.lock); - sk_for_each(sk, &iucv_sk_list.head) { - iucv = iucv_sk(sk); - switch (sk->sk_state) { - case IUCV_DISCONN: - case IUCV_CLOSING: - case IUCV_CONNECTED: - iucv_sever_path(sk, 0); - break; - case IUCV_OPEN: - case IUCV_BOUND: - case IUCV_LISTEN: - case IUCV_CLOSED: - default: - break; - } - skb_queue_purge(&iucv->send_skb_q); - skb_queue_purge(&iucv->backlog_skb_q); - } - read_unlock(&iucv_sk_list.lock); - return 0; -} - -/** - * afiucv_pm_restore_thaw() - Thaw and restore PM callback - * @dev: AFIUCV dummy device - * - * socket clean up after freeze - */ -static int afiucv_pm_restore_thaw(struct device *dev) -{ - struct sock *sk; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_restore_thaw\n"); -#endif - read_lock(&iucv_sk_list.lock); - sk_for_each(sk, &iucv_sk_list.head) { - switch (sk->sk_state) { - case IUCV_CONNECTED: - sk->sk_err = EPIPE; - sk->sk_state = IUCV_DISCONN; - sk->sk_state_change(sk); - break; - case IUCV_DISCONN: - case IUCV_CLOSING: - case IUCV_LISTEN: - case IUCV_BOUND: - case IUCV_OPEN: - default: - break; - } - } - read_unlock(&iucv_sk_list.lock); - return 0; -} - -static const struct dev_pm_ops afiucv_pm_ops = { - .prepare = afiucv_pm_prepare, - .complete = afiucv_pm_complete, - .freeze = afiucv_pm_freeze, - .thaw = afiucv_pm_restore_thaw, - .restore = afiucv_pm_restore_thaw, -}; - -static struct device_driver af_iucv_driver = { - .owner = THIS_MODULE, - .name = "afiucv", - .bus = NULL, - .pm = &afiucv_pm_ops, -}; - -/* dummy device used as trigger for PM functions */ -static struct 
device *af_iucv_dev; - /** * iucv_msg_length() - Returns the length of an iucv message. * @msg: Pointer to struct iucv_message, MUST NOT be NULL @@ -2409,45 +2304,11 @@ static struct packet_type iucv_packet_type = { static int afiucv_iucv_init(void) { - int err; - - err = pr_iucv->iucv_register(&af_iucv_handler, 0); - if (err) - goto out; - /* establish dummy device */ - af_iucv_driver.bus = pr_iucv->bus; - err = driver_register(&af_iucv_driver); - if (err) - goto out_iucv; - af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!af_iucv_dev) { - err = -ENOMEM; - goto out_driver; - } - dev_set_name(af_iucv_dev, "af_iucv"); - af_iucv_dev->bus = pr_iucv->bus; - af_iucv_dev->parent = pr_iucv->root; - af_iucv_dev->release = (void (*)(struct device *))kfree; - af_iucv_dev->driver = &af_iucv_driver; - err = device_register(af_iucv_dev); - if (err) - goto out_iucv_dev; - return 0; - -out_iucv_dev: - put_device(af_iucv_dev); -out_driver: - driver_unregister(&af_iucv_driver); -out_iucv: - pr_iucv->iucv_unregister(&af_iucv_handler, 0); -out: - return err; + return pr_iucv->iucv_register(&af_iucv_handler, 0); } static void afiucv_iucv_exit(void) { - device_unregister(af_iucv_dev); - driver_unregister(&af_iucv_driver); pr_iucv->iucv_unregister(&af_iucv_handler, 0); } -- cgit v1.2.3 From 0d1c7664ed2044048d5ed790b943affa296bc3a4 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:10 +0200 Subject: net/af_iucv: replace open-coded U16_MAX Improve the readability of a range check. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 84bd18fea1d6..b02470a04c50 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1559,7 +1560,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, switch (sk->sk_state) { case IUCV_OPEN: case IUCV_BOUND: - if (val < 1 || val > (u16)(~0)) + if (val < 1 || val > U16_MAX) rc = -EINVAL; else iucv->msglimit = val; -- cgit v1.2.3 From dca1262f97499337489da7f5aa0bc6fec247a83f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:11 +0200 Subject: net/af_iucv: remove a redundant zero initialization txmsg is declared as {0}, no need to clear individual fields later on. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index b02470a04c50..799dcf5483de 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -996,7 +996,6 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* initialize defaults */ cmsg_done = 0; /* check for duplicate headers */ - txmsg.class = 0; /* iterate over control messages */ for_each_cmsghdr(cmsg, msg) { -- cgit v1.2.3 From e9a36ca5f6f302675e7e36101ffa0ca7f9b8779b Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 19 May 2020 21:10:12 +0200 Subject: net/af_iucv: clean up function prototypes Remove a bunch of forward declarations (trivially shifting code around where needed), and make a few functions static. Signed-off-by: Julian Wiedmann Signed-off-by: David S. 
Miller --- include/net/iucv/af_iucv.h | 8 ---- net/iucv/af_iucv.c | 108 +++++++++++++++++++++------------------------ 2 files changed, 51 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 14a490246be9..9259ce2b22f3 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -158,12 +158,4 @@ struct iucv_sock_list { atomic_t autobind_name; }; -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait); -void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); -void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s); -void iucv_accept_enqueue(struct sock *parent, struct sock *sk); -void iucv_accept_unlink(struct sock *sk); -struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock); - #endif /* __IUCV_H */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 799dcf5483de..ee0add15497d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -37,8 +37,6 @@ static char iucv_userid[80]; -static const struct proto_ops iucv_sock_ops; - static struct proto iucv_proto = { .name = "AF_IUCV", .owner = THIS_MODULE, @@ -86,13 +84,11 @@ do { \ __ret; \ }) +static struct sock *iucv_accept_dequeue(struct sock *parent, + struct socket *newsock); static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); -static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, - struct sk_buff *skb, u8 flags); static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); /* Call Back functions */ @@ -331,6 +327,20 @@ static void iucv_sock_cleanup_listen(struct sock *parent) parent->sk_state = IUCV_CLOSED; } +static void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_add_node(sk, &l->head); + write_unlock_bh(&l->lock); +} + +static void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_del_node_init(sk); + write_unlock_bh(&l->lock); +} + /* Kill socket (only if zapped and orphaned) */ static void iucv_sock_kill(struct sock *sk) { @@ -503,53 +513,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, return sk; } -/* Create an IUCV socket */ -static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - struct sock *sk; - - if (protocol && protocol != PF_IUCV) - return -EPROTONOSUPPORT; - - sock->state = SS_UNCONNECTED; - - switch (sock->type) { - case SOCK_STREAM: - sock->ops = &iucv_sock_ops; - break; - case SOCK_SEQPACKET: - /* currently, proto ops can handle both sk types */ - sock->ops = &iucv_sock_ops; - break; - default: - return -ESOCKTNOSUPPORT; - } - - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); - if (!sk) - return -ENOMEM; - - iucv_sock_init(sk, NULL); - - return 0; -} - -void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) -{ - write_lock_bh(&l->lock); - sk_add_node(sk, &l->head); - write_unlock_bh(&l->lock); -} - -void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) -{ - write_lock_bh(&l->lock); - sk_del_node_init(sk); - write_unlock_bh(&l->lock); -} - -void iucv_accept_enqueue(struct sock *parent, struct sock *sk) +static void iucv_accept_enqueue(struct sock *parent, struct sock *sk) { unsigned long flags; struct iucv_sock *par = iucv_sk(parent); @@ -562,7 +526,7 @@ void 
iucv_accept_enqueue(struct sock *parent, struct sock *sk) sk_acceptq_added(parent); } -void iucv_accept_unlink(struct sock *sk) +static void iucv_accept_unlink(struct sock *sk) { unsigned long flags; struct iucv_sock *par = iucv_sk(iucv_sk(sk)->parent); @@ -575,7 +539,8 @@ void iucv_accept_unlink(struct sock *sk) sock_put(sk); } -struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock) +static struct sock *iucv_accept_dequeue(struct sock *parent, + struct socket *newsock) { struct iucv_sock *isk, *n; struct sock *sk; @@ -1406,8 +1371,8 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -2291,6 +2256,35 @@ static const struct proto_ops iucv_sock_ops = { .getsockopt = iucv_sock_getsockopt, }; +static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (protocol && protocol != PF_IUCV) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + case SOCK_SEQPACKET: + /* currently, proto ops can handle both sk types */ + sock->ops = &iucv_sock_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); + if (!sk) + return -ENOMEM; + + iucv_sock_init(sk, NULL); + + return 0; +} + static const struct net_proto_family iucv_sock_family_ops = { .family = AF_IUCV, .owner = THIS_MODULE, -- cgit v1.2.3 From c384b8a70c59d85f83a05c8963e71d35da2607b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:11 +0200 Subject: ipv4: streamline ipmr_new_tunnel Reduce a few levels of indentation to simplify the function. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv4/ipmr.c | 73 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5c218db2dede..a1169b694113 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -471,50 +471,49 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { - struct net_device *dev; + struct net_device *tunnel_dev, *new_dev; + struct ip_tunnel_parm p = { }; + mm_segment_t oldfs = get_fs(); + struct ifreq ifr; + int err; - dev = __dev_get_by_name(net, "tunl0"); + tunnel_dev = __dev_get_by_name(net, "tunl0"); + if (!tunnel_dev) + goto out; - if (dev) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; - struct ifreq ifr; - struct ip_tunnel_parm p; + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - memset(&p, 0, sizeof(p)); - p.iph.daddr = v->vifc_rmt_addr.s_addr; - p.iph.saddr = v->vifc_lcl_addr.s_addr; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPIP; - sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + if (!tunnel_dev->netdev_ops->ndo_do_ioctl) + goto out; - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); + err = tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + SIOCADDTUNNEL); + set_fs(oldfs); + if (err) + goto out; - set_fs(KERNEL_DS); - err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); - } else { - err = -EOPNOTSUPP; - } - dev = NULL; + new_dev = __dev_get_by_name(net, p.name); + if (!new_dev) + goto out; - if (err == 0 && - (dev = __dev_get_by_name(net, p.name)) != NULL) { - dev->flags |= IFF_MULTICAST; - if (!ipmr_init_vif_indev(dev)) - goto failure; - if (dev_open(dev, NULL)) - goto failure; - dev_hold(dev); - } - } - return dev; + new_dev->flags |= IFF_MULTICAST; + if (!ipmr_init_vif_indev(new_dev)) + goto out_unregister; + if (dev_open(new_dev, NULL)) + goto out_unregister; + dev_hold(new_dev); + return new_dev; -failure: - unregister_netdevice(dev); +out_unregister: + unregister_netdevice(new_dev); +out: return NULL; } -- cgit v1.2.3 From c1fd1182c43692022a7938d6b496fa21fcd49717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:12 +0200 Subject: ipv4: consolidate the VIFF_TUNNEL handling in ipmr_new_tunnel Also move the dev_set_allmulti call and the error handling into the ioctl helper. This allows reusing the already looked-up tunnel_dev pointer and the set-up argument structure for the deletion in the error handler. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv4/ipmr.c | 53 +++++++++++++---------------------------------- 1 file changed, 13 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a1169b694113..cd1a3260a99a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -421,37 +421,6 @@ static void ipmr_free_table(struct mr_table *mrt) /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ -static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) -{ - struct net *net = dev_net(dev); - - dev_close(dev); - - dev = __dev_get_by_name(net, "tunl0"); - if (dev) { - const struct net_device_ops *ops = dev->netdev_ops; - struct ifreq ifr; - struct ip_tunnel_parm p; - - memset(&p, 0, sizeof(p)); - p.iph.daddr = v->vifc_rmt_addr.s_addr; - p.iph.saddr = v->vifc_lcl_addr.s_addr; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPIP; - sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); - - set_fs(KERNEL_DS); - ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL); - set_fs(oldfs); - } - } -} - /* Initialize ipmr pimreg/tunnel in_device */ static bool ipmr_init_vif_indev(const struct net_device *dev) { @@ -509,12 +478,22 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) if (dev_open(new_dev, NULL)) goto out_unregister; dev_hold(new_dev); + err = dev_set_allmulti(new_dev, 1); + if (err) { + dev_close(new_dev); + set_fs(KERNEL_DS); + tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + SIOCDELTUNNEL); + set_fs(oldfs); + dev_put(new_dev); + new_dev = ERR_PTR(err); + } return new_dev; out_unregister: unregister_netdevice(new_dev); out: - return NULL; + return ERR_PTR(-ENOBUFS); } #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) @@ -866,14 +845,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, break; case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); - if (!dev) - return -ENOBUFS; - err = dev_set_allmulti(dev, 1); - if (err) { - ipmr_del_tunnel(dev, vifc); - dev_put(dev); - return err; - } + if (IS_ERR(dev)) + return PTR_ERR(dev); break; case VIFF_USE_IFINDEX: case 0: -- cgit v1.2.3 From 607259a695312cdfac2b52fb9d5b5890c834d573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:13 +0200 Subject: net: add a new ndo_tunnel_ioctl method This method is used to properly allow kernel callers of the IPv4 route management ioctls. The existing ip_tunnel_ioctl helper is renamed to ip_tunnel_ctl to better reflect that it doesn't directly implement ioctls touching user memory, and is used for the guts of ndo_tunnel_ctl implementations. A new ip_tunnel_ioctl helper is added that can be wired up directly to the ndo_do_ioctl method and takes care of the copy to and from userspace. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 35 ++++++++++++++--------------------- net/ipv4/ip_tunnel.c | 16 +++++++++++++++- net/ipv4/ip_vti.c | 32 +++++++++++++------------------- net/ipv4/ipip.c | 30 +++++++++--------------------- 6 files changed, 59 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a8f8daef09d..a18f8fdf4260 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -53,6 +53,7 @@ struct netpoll_info; struct device; struct phy_device; struct dsa_port; +struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; @@ -1274,6 +1275,9 @@ struct netdev_net_notifier { * Get devlink port instance associated with a given netdev. * Called with a reference on the netdevice and devlink locks only, * rtnl_lock is not held. + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int cmd); + * Add, change, delete or get information on an IPv4 tunnel. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1479,6 +1483,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); + int (*ndo_tunnel_ctl)(struct net_device *dev, + struct ip_tunnel_parm *p, int cmd); }; /** diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 236503a50759..076e5d7db7d3 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -269,7 +269,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0ce9b91ff55c..4e31f23e4117 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -768,45 +768,37 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, + int cmd) { - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || - ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || + ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p.i_flags = gre_flags_to_tnl_flags(p.i_flags); - p.o_flags = gre_flags_to_tnl_flags(p.o_flags); + p->i_flags = gre_flags_to_tnl_flags(p->i_flags); + p->o_flags = gre_flags_to_tnl_flags(p->o_flags); - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p.i_flags; - t->parms.o_flags = 
p.o_flags; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - + p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); return 0; } @@ -924,10 +916,11 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cd4b84310d92..f4f1d11eab50 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -860,7 +860,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -960,6 +960,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) done: return err; } +EXPORT_SYMBOL_GPL(ip_tunnel_ctl); + +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); + if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return err; +} EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1b4e6f298648..c8974360a99f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -378,38 +378,31 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || - p.iph.ihl != 5) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || + p->iph.ihl != 5) return -EINVAL; } - if (!(p.i_flags & GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags & GRE_KEY)) - p.o_key = 0; + if (!(p->i_flags & GRE_KEY)) + p->i_key = 0; + if (!(p->o_flags & GRE_KEY)) + p->o_key = 0; - p.i_flags = VTI_ISVTI; + p->i_flags = VTI_ISVTI; - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY; - p.o_flags |= GRE_KEY; + p->i_flags |= GRE_KEY; + p->o_flags |= GRE_KEY; } - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; return 0; } @@ -417,10 +410,11 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = 
ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2f01cf6fa0de..df663baf2516 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,41 +327,29 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { - int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || - !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + if (p->iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } - p.i_key = p.o_key = 0; - p.i_flags = p.o_flags = 0; - err = ip_tunnel_ioctl(dev, &p, cmd); - if (err) - return err; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - - return 0; + p->i_key = p->o_key = 0; + p->i_flags = p->o_flags = 0; + return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ipip_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ -- cgit v1.2.3 From c7e3670516042bfd8147151d9008b5874a6eb73e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:14 +0200 Subject: ipmr: use ->ndo_tunnel_ctl in ipmr_new_tunnel Use the new ->ndo_tunnel_ctl instead of overriding the address limit and using ->ndo_do_ioctl just to do a pointless user copy. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv4/ipmr.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index cd1a3260a99a..d3e9b80a57de 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -442,8 +442,6 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; struct ip_tunnel_parm p = { }; - mm_segment_t oldfs = get_fs(); - struct ifreq ifr; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); @@ -456,15 +454,11 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPIP; sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - if (!tunnel_dev->netdev_ops->ndo_do_ioctl) + if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl) goto out; - - set_fs(KERNEL_DS); - err = tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCADDTUNNEL); - set_fs(oldfs); if (err) goto out; @@ -481,10 +475,8 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) err = dev_set_allmulti(new_dev, 1); if (err) { dev_close(new_dev); - set_fs(KERNEL_DS); - tunnel_dev->netdev_ops->ndo_do_ioctl(tunnel_dev, &ifr, + tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, SIOCDELTUNNEL); - set_fs(oldfs); dev_put(new_dev); new_dev = ERR_PTR(err); } -- cgit v1.2.3 From fd5d687b76b32543f6b2024ebe21f988ed1f8859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:15 +0200 Subject: sit: refactor ipip6_tunnel_ioctl Split the ioctl handler into one function per command instead of having all the logic sit in one giant switch statement. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv6/sit.c | 368 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 210 insertions(+), 158 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 98954830c40b..7c158fdc02da 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -83,6 +83,13 @@ struct sit_net { struct net_device *fb_tunnel_dev; }; +static inline struct sit_net *dev_to_sit_net(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + return net_generic(t->net, sit_net_id); +} + /* * Must be invoked with rcu_read_lock */ @@ -291,14 +298,18 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct ip_tunnel *t, - struct ip_tunnel_prl __user *a) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) { + struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data; + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); @@ -441,6 +452,35 @@ out: return err; } +static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, + int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_prl prl; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + + if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + return -EFAULT; + + switch (cmd) { + case SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + dst_cache_reset(&t->dst_cache); + netdev_state_change(dev); + return err; +} + static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { @@ -1151,7 +1191,53 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, netdev_state_change(t->dev); return 0; } -#endif + +static int +ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + struct ip_tunnel_parm p; + + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + } + if (!t) + t = netdev_priv(dev); + + ip6rd.prefix = t->ip6rd.prefix; + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd))) + return -EFAULT; + + if (cmd != SIOCDEL6RD) { + err = ipip6_tunnel_update_6rd(t, &ip6rd); + if (err < 0) + return err; + } else + ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); + return 0; +} + +#endif /* CONFIG_IPV6_SIT_6RD */ static bool ipip6_valid_ip_proto(u8 ipproto) { @@ -1164,185 +1250,151 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) 
+__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) { - int err = 0; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!ipip6_valid_ip_proto(p->iph.protocol)) + return -EINVAL; + if (p->iph.version != 4 || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) + return -EINVAL; + + if (p->iph.ttl) + p->iph.frag_off |= htons(IP_DF); + return 0; +} + +static int +ipip6_tunnel_get(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; - struct ip_tunnel_prl prl; + + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + } + if (!t) + t = netdev_priv(dev); + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_add(struct net_device *dev, struct ifreq *ifr) +{ struct ip_tunnel *t = netdev_priv(dev); - struct net *net = t->net; - struct sit_net *sitn = net_generic(net, sit_net_id); -#ifdef CONFIG_IPV6_SIT_6RD - struct ip_tunnel_6rd ip6rd; -#endif + struct ip_tunnel_parm p; + int err; - switch (cmd) { - case SIOCGETTUNNEL: -#ifdef CONFIG_IPV6_SIT_6RD - case SIOCGET6RD: -#endif - if (dev == sitn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipip6_tunnel_locate(net, &p, 0); - if (!t) - t = netdev_priv(dev); - } + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = __ipip6_tunnel_ioctl_validate(t->net, &p); + if (err) + return err; - err = -EFAULT; - if (cmd == SIOCGETTUNNEL) { - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, - sizeof(p))) - goto done; -#ifdef CONFIG_IPV6_SIT_6RD + t = ipip6_tunnel_locate(t->net, &p, 1); + if (!t) + return -ENOBUFS; + + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_change(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = __ipip6_tunnel_ioctl_validate(t->net, &p); + if (err) + return err; + + t = ipip6_tunnel_locate(t->net, &p, 0); + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (!t) + return -ENOENT; + } else { + if (t) { + if (t->dev != dev) + return -EEXIST; } else { - ip6rd.prefix = t->ip6rd.prefix; - ip6rd.relay_prefix = t->ip6rd.relay_prefix; - ip6rd.prefixlen = t->ip6rd.prefixlen; - ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, - sizeof(ip6rd))) - goto done; -#endif + if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + return -EINVAL; + t = netdev_priv(dev); } - err = 0; - break; - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; + ipip6_tunnel_update(t, &p, t->fwmark); + } - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if (!ipip6_valid_ip_proto(p.iph.protocol)) - goto done; - if (p.iph.version != 4 || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 
- if (t) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + return -EFAULT; + return 0; +} - ipip6_tunnel_update(t, &p, t->fwmark); - } +static int +ipip6_tunnel_del(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm p; - if (t) { - err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == sitn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - t = ipip6_tunnel_locate(net, &p, 0); - if (!t) - goto done; - err = -EPERM; - if (t == netdev_priv(sitn->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + if (!t) + return -ENOENT; + if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) + return -EPERM; + dev = t->dev; + } + unregister_netdevice(dev); + return 0; +} +static int +ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + return ipip6_tunnel_get(dev, ifr); + case SIOCADDTUNNEL: + return ipip6_tunnel_add(dev, ifr); + case SIOCCHGTUNNEL: + return ipip6_tunnel_change(dev, ifr); + case SIOCDELTUNNEL: + return ipip6_tunnel_del(dev, ifr); case SIOCGETPRL: - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); - break; - + return ipip6_tunnel_get_prl(dev, ifr); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = -EFAULT; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) - goto done; - - switch (cmd) { - case SIOCDELPRL: - err = ipip6_tunnel_del_prl(t, &prl); - break; - case SIOCADDPRL: - case SIOCCHGPRL: - err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); - break; - } - dst_cache_reset(&t->dst_cache); - netdev_state_change(dev); - break; - + return ipip6_tunnel_prl_ctl(dev, ifr, cmd); #ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: + return ipip6_tunnel_get6rd(dev, ifr); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, - sizeof(ip6rd))) - goto done; - - if (cmd != SIOCDEL6RD) { - err = ipip6_tunnel_update_6rd(t, &ip6rd); - if (err < 0) - goto done; - } else - ipip6_tunnel_clone_6rd(dev, sitn); - - err = 0; - break; + return ipip6_tunnel_6rdctl(dev, ifr, cmd); #endif - default: - err = -EINVAL; + return -EINVAL; } - -done: - return err; } static const struct net_device_ops ipip6_netdev_ops = { -- cgit v1.2.3 From f60fe2df931d6c0b41c3d4f8ec3f7a429f977a3f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:16 +0200 Subject: sit: implement
->ndo_tunnel_ctl Implement the ->ndo_tunnel_ctl method, and use ip_tunnel_ioctl to handle userspace requests for the SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/ipv6/sit.c | 73 +++++++++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 7c158fdc02da..1fbb4dfbb191 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1267,60 +1267,45 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; - if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - t = ipip6_tunnel_locate(t->net, &p, 0); - } + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + t = ipip6_tunnel_locate(t->net, p, 0); if (!t) t = netdev_priv(dev); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - return -EFAULT; + memcpy(p, &t->parms, sizeof(*p)); return 0; } static int -ipip6_tunnel_add(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - err = __ipip6_tunnel_ioctl_validate(t->net, &p); + err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; - t = ipip6_tunnel_locate(t->net, &p, 1); + t = ipip6_tunnel_locate(t->net, p, 1); if (!t) return -ENOBUFS; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - return -EFAULT; return 0; } static int -ipip6_tunnel_change(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - err = __ipip6_tunnel_ioctl_validate(t->net, &p); + err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; - t = ipip6_tunnel_locate(t->net, &p, 0); + t = ipip6_tunnel_locate(t->net, p, 0); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (!t) return -ENOENT; @@ -1329,33 +1314,28 @@ ipip6_tunnel_change(struct net_device *dev, struct ifreq *ifr) if (t->dev != dev) return -EEXIST; } else { - if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) return -EINVAL; t = netdev_priv(dev); } - ipip6_tunnel_update(t, &p, t->fwmark); + ipip6_tunnel_update(t, p, t->fwmark); } - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - return -EFAULT; return 0; } static int -ipip6_tunnel_del(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - t = ipip6_tunnel_locate(t->net, &p, 0); + t = ipip6_tunnel_locate(t->net, p, 0); if (!t) return 
-ENOENT; if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) @@ -1366,18 +1346,32 @@ ipip6_tunnel_del(struct net_device *dev, struct ifreq *ifr) return 0; } +static int +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + return ipip6_tunnel_get(dev, p); + case SIOCADDTUNNEL: + return ipip6_tunnel_add(dev, p); + case SIOCCHGTUNNEL: + return ipip6_tunnel_change(dev, p); + case SIOCDELTUNNEL: + return ipip6_tunnel_del(dev, p); + default: + return -EINVAL; + } +} + static int ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { switch (cmd) { case SIOCGETTUNNEL: - return ipip6_tunnel_get(dev, ifr); case SIOCADDTUNNEL: - return ipip6_tunnel_add(dev, ifr); case SIOCCHGTUNNEL: - return ipip6_tunnel_change(dev, ifr); case SIOCDELTUNNEL: - return ipip6_tunnel_del(dev, ifr); + return ip_tunnel_ioctl(dev, ifr, cmd); case SIOCGETPRL: return ipip6_tunnel_get_prl(dev, ifr); case SIOCADDPRL: @@ -1404,6 +1398,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) -- cgit v1.2.3 From f098846044c9edb359bf2dae4bcf0d537dda22fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:17 +0200 Subject: ipv6: stub out even more of addrconf_set_dstaddr if SIT is disabled There is no point in copying the structure from userspace or looking up a device if SIT support is disabled, as we'll eventually return -ENODEV anyway. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ab7e839753ae..8300176f91e7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2794,6 +2794,9 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) struct net_device *dev; int err = -EINVAL; + if (!IS_ENABLED(CONFIG_IPV6_SIT)) + return -ENODEV; + rtnl_lock(); err = -EFAULT; @@ -2806,7 +2809,6 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (!dev) goto err_exit; -#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT) { const struct net_device_ops *ops = dev->netdev_ops; struct ifreq ifr; @@ -2842,7 +2844,6 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) err = dev_open(dev, NULL); } } -#endif err_exit: rtnl_unlock(); -- cgit v1.2.3 From 68ad6886dd3fb7d0b336363a90ace55b22f0dcb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:18 +0200 Subject: ipv6: streamline addrconf_set_dstaddr Factor out an addrconf_set_sit_dstaddr helper for the actual work if we found a SIT device, and only hold the rtnl lock around the device lookup and that new helper, as there is no point in holding it over a copy_from_user call. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- net/ipv6/addrconf.c | 87 +++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8300176f91e7..c827edf87741 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2783,6 +2783,38 @@ put: in6_dev_put(in6_dev); } +static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, + struct in6_ifreq *ireq) +{ + struct ip_tunnel_parm p = { }; + mm_segment_t oldfs = get_fs(); + struct ifreq ifr; + int err; + + if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) + return -EADDRNOTAVAIL; + + p.iph.daddr = ireq->ifr6_addr.s6_addr32[3]; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + if (!dev->netdev_ops->ndo_do_ioctl) + return -EOPNOTSUPP; + set_fs(KERNEL_DS); + err = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + if (err) + return err; + + dev = __dev_get_by_name(net, p.name); + if (!dev) + return -ENOBUFS; + return dev_open(dev, NULL); +} + /* * Set destination address. * Special case for SIT interfaces where we create a new "virtual" @@ -2790,62 +2822,19 @@ put: */ int addrconf_set_dstaddr(struct net *net, void __user *arg) { - struct in6_ifreq ireq; struct net_device *dev; - int err = -EINVAL; + struct in6_ifreq ireq; + int err = -ENODEV; if (!IS_ENABLED(CONFIG_IPV6_SIT)) return -ENODEV; - - rtnl_lock(); - - err = -EFAULT; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - goto err_exit; + return -EFAULT; + rtnl_lock(); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); - - err = -ENODEV; - if (!dev) - goto err_exit; - - if (dev->type == ARPHRD_SIT) { - const struct net_device_ops *ops = dev->netdev_ops; - struct ifreq ifr; - struct ip_tunnel_parm p; - - err = -EADDRNOTAVAIL; - if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) - goto err_exit; - - memset(&p, 0, sizeof(p)); - p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; - p.iph.saddr = 0; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPV6; - p.iph.ttl = 64; - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); - - set_fs(KERNEL_DS); - err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); - } else - err = -EOPNOTSUPP; - - if (err == 0) { - err = -ENOBUFS; - dev = __dev_get_by_name(net, p.name); - if (!dev) - goto err_exit; - err = dev_open(dev, NULL); - } - } - -err_exit: + if (dev && dev->type == ARPHRD_SIT) + err = addrconf_set_sit_dstaddr(net, dev, &ireq); rtnl_unlock(); return err; } -- cgit v1.2.3 From 8e3db0bbb29aa8d135341c5327bae738e93932a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:19 +0200 Subject: ipv6: use ->ndo_tunnel_ctl in addrconf_set_dstaddr Use the new ->ndo_tunnel_ctl instead of overriding the address limit and using ->ndo_do_ioctl just to do a pointless user copy. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c827edf87741..09cfbf5dd7ce 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2787,8 +2787,6 @@ static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { struct ip_tunnel_parm p = { }; - mm_segment_t oldfs = get_fs(); - struct ifreq ifr; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) @@ -2799,13 +2797,10 @@ static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPV6; p.iph.ttl = 64; - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - if (!dev->netdev_ops->ndo_do_ioctl) + if (!dev->netdev_ops->ndo_tunnel_ctl) return -EOPNOTSUPP; - set_fs(KERNEL_DS); - err = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL); if (err) return err; -- cgit v1.2.3 From 4f65e2f483b6f764c15094d14dd53dda048a4048 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2020 15:50:12 -0700 Subject: net: unexport skb_gro_receive() skb_gro_receive() used to be used by SCTP; that is no longer the case. skb_gro_receive_list() is in the same category: never used from modules. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 35a133c6d13b..b8afefe6f6b6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3727,7 +3727,6 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive_list); /** * skb_segment - Perform protocol segmentation on skb. @@ -4191,7 +4190,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 -- cgit v1.2.3 From 3ca44c16b0dcc764b641ee4ac226909f5c421aa3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 19 May 2020 13:25:19 -0700 Subject: Bluetooth: Consolidate encryption handling in hci_encrypt_cfm This makes hci_encrypt_cfm call hci_connect_cfm when the connection state is BT_CONFIG, so callers don't have to check the state.
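For illustration only, here is a minimal standalone model of that consolidation (simplified stand-in types and flags, not the kernel's; the real hunk follows below): once the BT_CONFIG branch lives inside the callee, every caller collapses to a single unconditional call.

    /* encrypt_cfm_model.c - toy model of folding a per-caller state check
     * into the callee; types and values are simplified stand-ins. */
    #include <stdio.h>

    enum conn_state { CFG, CONNECTED };

    struct conn {
        enum conn_state state;
        int encrypt_on;   /* stands in for HCI_CONN_ENCRYPT */
        int aes_ccm;      /* stands in for HCI_CONN_AES_CCM */
    };

    static void connect_cfm(struct conn *c, int status)
    {
        (void)c;
        printf("connect_cfm(status=%d)\n", status);
    }

    static void encrypt_cfm(struct conn *c, int status)
    {
        int encrypt;

        /* The state check callers used to open-code now lives here
         * (the kernel version also adjusts the state based on status). */
        if (c->state == CFG) {
            c->state = CONNECTED;
            connect_cfm(c, status);
            return;
        }

        /* Derive the encrypt value from the connection flags. */
        if (!c->encrypt_on)
            encrypt = 0x00;
        else if (c->aes_ccm)
            encrypt = 0x02;
        else
            encrypt = 0x01;

        printf("encrypt_cfm(status=%d, encrypt=0x%02x)\n", status, encrypt);
    }

    int main(void)
    {
        struct conn c = { CFG, 1, 0 };
        encrypt_cfm(&c, 0);  /* still configuring: routed to connect_cfm */
        encrypt_cfm(&c, 0);  /* connected: reports encrypt=0x01 */
        return 0;
    }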
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 20 ++++++++++++++++++-- net/bluetooth/hci_event.c | 28 +++------------------------- 2 files changed, 21 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5dcf85f186c6..cdd4f1db8670 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1381,10 +1381,26 @@ static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) conn->security_cfm_cb(conn, status); } -static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status, - __u8 encrypt) +static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) { struct hci_cb *cb; + __u8 encrypt; + + if (conn->state == BT_CONFIG) { + if (status) + conn->state = BT_CONNECTED; + + hci_connect_cfm(conn, status); + hci_conn_drop(conn); + return; + } + + if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + encrypt = 0x00; + else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) + encrypt = 0x02; + else + encrypt = 0x01; if (conn->sec_level == BT_SECURITY_SDP) conn->sec_level = BT_SECURITY_LOW; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f024b3d57a1c..cfeaee347db3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2931,7 +2931,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) &cp); } else { clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - hci_encrypt_cfm(conn, ev->status, 0x00); + hci_encrypt_cfm(conn, ev->status); } } @@ -3016,22 +3016,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, conn->enc_key_size = rp->key_size; } - if (conn->state == BT_CONFIG) { - conn->state = BT_CONNECTED; - hci_connect_cfm(conn, 0); - hci_conn_drop(conn); - } else { - u8 encrypt; - - if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) - encrypt = 0x00; - else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) - encrypt = 0x02; - else - encrypt = 0x01; - - hci_encrypt_cfm(conn, 0, encrypt); - } + hci_encrypt_cfm(conn, 0); unlock: hci_dev_unlock(hdev); @@ -3149,14 +3134,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) } notify: - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; - - hci_connect_cfm(conn, ev->status); - hci_conn_drop(conn); - } else - hci_encrypt_cfm(conn, ev->status, ev->encrypt); + hci_encrypt_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); -- cgit v1.2.3 From 755dfcbca83710fa967d0efa7c5bb601f871a747 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 19 May 2020 13:25:17 -0700 Subject: Bluetooth: Fix assuming EIR flags can result in SSP authentication EIR flags should only hint that SSP may be supported; we shall verify this against the actual features, since the SSP bits may be disabled in the lower layers, which would result in legacy authentication being used.
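The rule the fix enforces can be stated as a one-line predicate; the snippet below is a hedged sketch with invented helper names, not the kernel's API.

    /* ssp_hint.c - an EIR advertisement of SSP is advisory; only the
     * negotiated features decide whether SSP can actually be used. */
    #include <stdio.h>

    static int can_use_ssp(int eir_hints_ssp, int local_ssp, int remote_ssp)
    {
        (void)eir_hints_ssp;             /* a hint only - never trust it */
        return local_ssp && remote_ssp;  /* both sides must really support it */
    }

    int main(void)
    {
        /* EIR claims SSP, but the host stack has SSP disabled: fall back
         * to legacy authentication rather than assuming SSP. */
        printf("use SSP: %d\n", can_use_ssp(1, 0, 1));
        return 0;
    }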
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 07c34c55fc50..307800fd18e6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -225,8 +225,6 @@ static void hci_acl_create_connection(struct hci_conn *conn) } memcpy(conn->dev_class, ie->data.dev_class, 3); - if (ie->data.ssp_mode > 0) - set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); } cp.pkt_type = cpu_to_le16(conn->pkt_type); -- cgit v1.2.3 From f78cdbd75a57245ecc68f5a40e470933426a082b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 20 May 2020 12:41:47 -0700 Subject: rds: fix crash in rds_info_getsockopt() The conversion to pin_user_pages() had a bug: it overlooked the case of allocation of pages failing. Fix that by restoring an equivalent check. Reported-by: syzbot+118ac0af4ac7f785a45b@syzkaller.appspotmail.com Fixes: dbfe7d74376e ("rds: convert get_user_pages() --> pin_user_pages()") Cc: David S. Miller Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org Cc: rds-devel@oss.oracle.com Signed-off-by: John Hubbard Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/info.c b/net/rds/info.c index e1d63563e81c..b6b46a8214a0 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -234,7 +234,8 @@ call_func: ret = -EFAULT; out: - unpin_user_pages(pages, nr_pages); + if (pages) + unpin_user_pages(pages, nr_pages); kfree(pages); return ret; -- cgit v1.2.3 From 63287de66df11308d239483415d67fe94079f47b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 9 May 2020 20:58:17 -0400 Subject: lift compat definitions of mcast [sg]etsockopt requests into net/compat.h We want to get rid of compat_mc_[sg]etsockopt() and to have that stuff handled without compat_alloc_user_space(), extra copying through userland, etc. To do that we'll need ipv4 and ipv6 instances of ->compat_[sg]etsockopt() to manipulate the 32bit variants of mcast requests, so we need to move the definitions of those out of net/compat.c and into a public header. 
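To make that need concrete before the move itself, the toy program below (a simplified sketch: sockaddr_storage stands in for __kernel_sockaddr_storage, and the real member types differ) compares a natively aligned structure with its compat twin. On a 64-bit build the member offsets and total sizes differ, which is exactly why one definition cannot serve both ABIs.

    /* layout.c - compare a natively aligned struct with its compat twin. */
    #include <stdio.h>
    #include <stddef.h>
    #include <sys/socket.h>

    struct native_group_req {
        unsigned int gr_interface;
        struct sockaddr_storage gr_group;   /* naturally 8-byte aligned on 64-bit */
    };

    struct compat_group_req_m {
        unsigned int gr_interface;
        struct sockaddr_storage gr_group __attribute__((aligned(4)));
    } __attribute__((packed));

    int main(void)
    {
        printf("native: offsetof(gr_group)=%zu sizeof=%zu\n",
               offsetof(struct native_group_req, gr_group),
               sizeof(struct native_group_req));
        printf("compat: offsetof(gr_group)=%zu sizeof=%zu\n",
               offsetof(struct compat_group_req_m, gr_group),
               sizeof(struct compat_group_req_m));
        return 0;
    }

On x86-64 the native offset is 8 while the compat one is 4, matching the __aligned(4)/__packed annotations on the definitions being moved below.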
This patch just does a mechanical move to include/net/compat.h Signed-off-by: Al Viro --- include/net/compat.h | 24 ++++++++++++++++++++++++ net/compat.c | 25 ------------------------- 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index 2b5e1f7ba153..69a8cd29c0ae 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -74,4 +74,28 @@ int compat_mc_getsockopt(struct sock *, int, int, char __user *, int __user *, int (*)(struct sock *, int, int, char __user *, int __user *)); +struct compat_group_req { + __u32 gr_interface; + struct __kernel_sockaddr_storage gr_group + __aligned(4); +} __packed; + +struct compat_group_source_req { + __u32 gsr_interface; + struct __kernel_sockaddr_storage gsr_group + __aligned(4); + struct __kernel_sockaddr_storage gsr_source + __aligned(4); +} __packed; + +struct compat_group_filter { + __u32 gf_interface; + struct __kernel_sockaddr_storage gf_group + __aligned(4); + __u32 gf_fmode; + __u32 gf_numsrc; + struct __kernel_sockaddr_storage gf_slist[1] + __aligned(4); +} __packed; + #endif /* NET_COMPAT_H */ diff --git a/net/compat.c b/net/compat.c index 69fc6d1e4e6e..032114de4fec 100644 --- a/net/compat.c +++ b/net/compat.c @@ -448,34 +448,9 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, return __compat_sys_getsockopt(fd, level, optname, optval, optlen); } -struct compat_group_req { - __u32 gr_interface; - struct __kernel_sockaddr_storage gr_group - __aligned(4); -} __packed; - -struct compat_group_source_req { - __u32 gsr_interface; - struct __kernel_sockaddr_storage gsr_group - __aligned(4); - struct __kernel_sockaddr_storage gsr_source - __aligned(4); -} __packed; - -struct compat_group_filter { - __u32 gf_interface; - struct __kernel_sockaddr_storage gf_group - __aligned(4); - __u32 gf_fmode; - __u32 gf_numsrc; - struct __kernel_sockaddr_storage gf_slist[1] - __aligned(4); -} __packed; - #define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ sizeof(struct __kernel_sockaddr_storage)) - int compat_mc_setsockopt(struct sock *sock, int level, int optname, char __user *optval, unsigned int optlen, int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int)) -- cgit v1.2.3 From e9c375fb5edeb550786d1436784db909bf672e9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 9 May 2020 21:16:36 -0400 Subject: compat_ip{,v6}_setsockopt(): enumerate MCAST_... options explicitly We want to check if optname is among the MCAST_... ones; do that as an explicit switch. 
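As a side note on the pattern (an illustrative sketch with made-up option values, not the real uapi numbers): the switch names exactly the options that carry mcast request structures, rather than relying on how the constants happen to be ordered, so the dispatch stays self-documenting even if unrelated options are later defined nearby.

    /* dispatch.c - an explicit switch is self-documenting where a
     * numeric range check silently depends on constant ordering. */
    #include <stdio.h>

    enum {                       /* illustrative values only */
        MCAST_JOIN_GROUP = 42,
        MCAST_BLOCK_SOURCE,
        MCAST_UNBLOCK_SOURCE,
        MCAST_LEAVE_GROUP,
        MCAST_JOIN_SOURCE_GROUP,
        MCAST_LEAVE_SOURCE_GROUP,
        MCAST_MSFILTER,
        IP_SOMETHING_ELSE,       /* hypothetical non-mcast option */
    };

    static int is_mcast_struct_opt(int optname)
    {
        switch (optname) {
        case MCAST_JOIN_GROUP:
        case MCAST_LEAVE_GROUP:
        case MCAST_JOIN_SOURCE_GROUP:
        case MCAST_LEAVE_SOURCE_GROUP:
        case MCAST_BLOCK_SOURCE:
        case MCAST_UNBLOCK_SOURCE:
        case MCAST_MSFILTER:
            return 1;
        default:
            return 0;
        }
    }

    int main(void)
    {
        printf("MCAST_MSFILTER: %d\n", is_mcast_struct_opt(MCAST_MSFILTER));
        printf("IP_SOMETHING_ELSE: %d\n", is_mcast_struct_opt(IP_SOMETHING_ELSE));
        return 0;
    }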
Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 10 +++++++++- net/ipv6/ipv6_sockglue.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8206047d70b6..3c2c6cd3933b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1272,9 +1272,17 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IP) return -ENOPROTOOPT; - if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ip_setsockopt); + } err = do_ip_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a0e50cc57e54..96e3f603c8d8 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -973,9 +973,17 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ipv6_setsockopt); + } err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER -- cgit v1.2.3 From 931ca7ab7fe804d77bc6952f1512950c0d870f26 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 17:18:30 -0400 Subject: ip*_mc_gsfget(): lift copyout of struct group_filter into callers pass the userland pointer to the array in its tail, so that part gets copied out by our functions; copyout of everything else is done in the callers. Rationale: reuse for compat; the array is the same in native and compat, the layout of parts before it is different for compat. 
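From the userspace side, the interface these patches rework looks roughly like this (a hedged sketch: it assumes glibc's <netinet/in.h> exposes struct group_filter, that the socket has already joined 239.1.1.1, and error handling is trimmed to the essentials).

    /* msfget.c - query the source filter of a joined IPv4 multicast group. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    #define MAX_SRCS 16

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        /* Room for the fixed header plus MAX_SRCS gf_slist entries, the
         * same arithmetic as GROUP_FILTER_SIZE() on the kernel side. */
        socklen_t len = sizeof(struct group_filter) +
                        (MAX_SRCS - 1) * sizeof(struct sockaddr_storage);
        struct group_filter *gf = calloc(1, len);
        struct sockaddr_in *sin = (struct sockaddr_in *)&gf->gf_group;

        gf->gf_interface = 0;        /* any interface */
        gf->gf_numsrc = MAX_SRCS;    /* capacity we are handing in */
        sin->sin_family = AF_INET;
        inet_pton(AF_INET, "239.1.1.1", &sin->sin_addr);

        if (getsockopt(fd, IPPROTO_IP, MCAST_MSFILTER, gf, &len) < 0) {
            perror("getsockopt(MCAST_MSFILTER)"); /* EADDRNOTAVAIL: not joined */
            return 1;
        }
        printf("fmode=%s, %u source(s)\n",
               gf->gf_fmode == MCAST_INCLUDE ? "include" : "exclude",
               gf->gf_numsrc);
        free(gf);
        return 0;
    }

The kernel fills the fixed header and up to gf_numsrc gf_slist entries; after this patch the header copyout happens in the ip_sockglue.c/ipv6_sockglue.c callers, while ip_mc_gsfget()/ip6_mc_msfget() only copy the array tail.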
Signed-off-by: Al Viro --- include/linux/igmp.h | 2 +- include/net/ipv6.h | 2 +- net/ipv4/igmp.c | 18 +++++------------- net/ipv4/ip_sockglue.c | 19 ++++++++++++++----- net/ipv6/ipv6_sockglue.c | 18 ++++++++++++++---- net/ipv6/mcast.c | 10 +++------- 6 files changed, 38 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index faa6586a5783..64ce8cd1cfaf 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -123,7 +123,7 @@ extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_msfilter __user *optval, int __user *optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); + struct sockaddr_storage __user *p); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5fc3a9d7b053..c45eb78d970f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1138,7 +1138,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); + struct sockaddr_storage __user *p); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 47f0502b2101..7b272bbed2b4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2565,9 +2565,9 @@ done: } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage __user *p) { - int err, i, count, copycount; + int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; @@ -2583,37 +2583,29 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - err = -EADDRNOTAVAIL; - for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ - goto done; + return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? 
count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -done: - return err; } /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3c2c6cd3933b..e3703a3e7ef4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1473,19 +1473,28 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, } case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; - if (len < GROUP_FILTER_SIZE(0)) { + if (len < size0) { err = -EINVAL; goto out; } - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + if (copy_from_user(&gsf, p, size0)) { err = -EFAULT; goto out; } - err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, - optlen); + num = gsf.gf_numsrc; + err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + if (err) + goto out; + if (gsf.gf_numsrc < num) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; goto out; } case IP_MULTICAST_ALL: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 96e3f603c8d8..e4a62ca1a3d0 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1056,18 +1056,28 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; int err; - if (len < GROUP_FILTER_SIZE(0)) + if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + if (copy_from_user(&gsf, p, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; + num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + if (!err) { + if (num > gsf.gf_numsrc) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; + } release_sock(sk); return err; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index eaa4c2cc2fbb..97d796c7d6c0 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -547,7 +547,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage *p) { int err, i, count, copycount; const struct in6_addr *group; @@ -592,14 +592,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } /* changes to psl require the socket lock, and a write lock * on pmc->sflock. We have the socket lock so reading here is safe. 
*/ - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -607,7 +603,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -- cgit v1.2.3 From 0dfe6581a7e35bafe5fc4d9b84edd0e66b4fd78a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 22:08:59 -0400 Subject: get rid of compat_mc_getsockopt() now we can do MCAST_MSFILTER in compat ->getsockopt() without playing silly buggers with copying things back and forth. We can form a native struct group_filter (sans the variable-length tail) on stack, pass that + pointer to the tail of original request to the helper doing the bulk of the work, then do the rest of copyout - same as the native getsockopt() does. Signed-off-by: Al Viro --- include/net/compat.h | 3 -- net/compat.c | 79 ------------------------------------------------ net/ipv4/ip_sockglue.c | 44 +++++++++++++++++++++++++-- net/ipv6/ipv6_sockglue.c | 41 +++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index 69a8cd29c0ae..d714076d63d5 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -70,9 +70,6 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *, struct sock *, int compat_mc_setsockopt(struct sock *, int, int, char __user *, unsigned int, int (*)(struct sock *, int, int, char __user *, unsigned int)); -int compat_mc_getsockopt(struct sock *, int, int, char __user *, int __user *, - int (*)(struct sock *, int, int, char __user *, - int __user *)); struct compat_group_req { __u32 gr_interface; diff --git a/net/compat.c b/net/compat.c index 032114de4fec..7bdfda2b382a 100644 --- a/net/compat.c +++ b/net/compat.c @@ -538,85 +538,6 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, } EXPORT_SYMBOL(compat_mc_setsockopt); -int compat_mc_getsockopt(struct sock *sock, int level, int optname, - char __user *optval, int __user *optlen, - int (*getsockopt)(struct sock *, int, int, char __user *, int __user *)) -{ - struct compat_group_filter __user *gf32 = (void __user *)optval; - struct group_filter __user *kgf; - int __user *koptlen; - u32 interface, fmode, numsrc; - int klen, ulen, err; - - if (optname != MCAST_MSFILTER) - return getsockopt(sock, level, optname, optval, optlen); - - koptlen = compat_alloc_user_space(sizeof(*koptlen)); - if (!access_ok(optlen, sizeof(*optlen)) || - __get_user(ulen, optlen)) - return -EFAULT; - - /* adjust len for pad */ - klen = ulen + sizeof(*kgf) - sizeof(*gf32); - - if (klen < GROUP_FILTER_SIZE(0)) - return -EINVAL; - - if (!access_ok(koptlen, sizeof(*koptlen)) || - __put_user(klen, koptlen)) - return -EFAULT; - - /* have to allow space for previous compat_alloc_user_space, too */ - kgf = compat_alloc_user_space(klen+sizeof(*optlen)); - - if (!access_ok(gf32, __COMPAT_GF0_SIZE) || - __get_user(interface, &gf32->gf_interface) || - __get_user(fmode, &gf32->gf_fmode) || - __get_user(numsrc, &gf32->gf_numsrc) || - __put_user(interface, &kgf->gf_interface) || - __put_user(fmode, &kgf->gf_fmode) || - __put_user(numsrc, &kgf->gf_numsrc) || - copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group))) - return -EFAULT; - - err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen); - if (err) 
- return err; - - if (!access_ok(koptlen, sizeof(*koptlen)) || - __get_user(klen, koptlen)) - return -EFAULT; - - ulen = klen - (sizeof(*kgf)-sizeof(*gf32)); - - if (!access_ok(optlen, sizeof(*optlen)) || - __put_user(ulen, optlen)) - return -EFAULT; - - if (!access_ok(kgf, klen) || - !access_ok(gf32, ulen) || - __get_user(interface, &kgf->gf_interface) || - __get_user(fmode, &kgf->gf_fmode) || - __get_user(numsrc, &kgf->gf_numsrc) || - __put_user(interface, &gf32->gf_interface) || - __put_user(fmode, &gf32->gf_fmode) || - __put_user(numsrc, &gf32->gf_numsrc)) - return -EFAULT; - if (numsrc) { - int copylen; - - klen -= GROUP_FILTER_SIZE(0); - copylen = numsrc * sizeof(gf32->gf_slist[0]); - if (copylen > klen) - copylen = klen; - if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen)) - return -EFAULT; - } - return err; -} -EXPORT_SYMBOL(compat_mc_getsockopt); - - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e3703a3e7ef4..65a30e7672ff 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1607,9 +1607,47 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, { int err; - if (optname == MCAST_MSFILTER) - return compat_mc_getsockopt(sk, level, optname, optval, optlen, - ip_getsockopt); + if (optname == MCAST_MSFILTER) { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = (void __user *)optval; + struct compat_group_filter gf32; + struct group_filter gf; + int ulen, err; + int num; + + if (level != SOL_IP) + return -EOPNOTSUPP; + + if (get_user(ulen, optlen)) + return -EFAULT; + + if (ulen < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + rtnl_lock(); + lock_sock(sk); + err = ip_mc_gsfget(sk, &gf, p->gf_slist); + release_sock(sk); + rtnl_unlock(); + if (err) + return err; + if (gf.gf_numsrc < num) + num = gf.gf_numsrc; + ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); + if (put_user(ulen, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; + } err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e4a62ca1a3d0..0bbafe73bdde 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1446,9 +1446,44 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - if (optname == MCAST_MSFILTER) - return compat_mc_getsockopt(sk, level, optname, optval, optlen, - ipv6_getsockopt); + if (optname == MCAST_MSFILTER) { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = (void __user *)optval; + struct compat_group_filter gf32; + struct group_filter gf; + int ulen, err; + int num; + + if (get_user(ulen, optlen)) + return -EFAULT; + + if (ulen < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + if (gf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, p->gf_slist); + 
release_sock(sk); + if (err) + return err; + if (num > gf.gf_numsrc) + num = gf.gf_numsrc; + ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); + if (put_user(ulen, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; + } err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -- cgit v1.2.3 From e986d4dabcb8fab87b36b607bc710e42fe206baf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2020 22:37:56 -0400 Subject: set_mcast_msfilter(): take the guts of setsockopt(MCAST_MSFILTER) into a helper Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 73 +++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 65a30e7672ff..cc0441157b02 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -587,6 +587,43 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int set_mcast_msfilter(struct sock *sk, int ifindex, + int numsrc, int fmode, + struct sockaddr_storage *group, + struct sockaddr_storage *list) +{ + int msize = IP_MSFILTER_SIZE(numsrc); + struct ip_msfilter *msf; + struct sockaddr_in *psin; + int err, i; + + msf = kmalloc(msize, GFP_KERNEL); + if (!msf) + return -ENOBUFS; + + psin = (struct sockaddr_in *)group; + if (psin->sin_family != AF_INET) + goto Eaddrnotavail; + msf->imsf_multiaddr = psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = fmode; + msf->imsf_numsrc = numsrc; + for (i = 0; i < numsrc; ++i) { + psin = (struct sockaddr_in *)&list[i]; + + if (psin->sin_family != AF_INET) + goto Eaddrnotavail; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + err = ip_mc_msfilter(sk, msf, ifindex); + kfree(msf); + return err; + +Eaddrnotavail: + kfree(msf); + return -EADDRNOTAVAIL; +} + static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -1079,10 +1116,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } case MCAST_MSFILTER: { - struct sockaddr_in *psin; - struct ip_msfilter *msf = NULL; struct group_filter *gsf = NULL; - int msize, i, ifindex; if (optlen < GROUP_FILTER_SIZE(0)) goto e_inval; @@ -1095,7 +1129,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = PTR_ERR(gsf); break; } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { @@ -1106,36 +1139,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EINVAL; goto mc_msf_out; } - msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); - msf = kmalloc(msize, GFP_KERNEL); - if (!msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - ifindex = gsf->gf_interface; - psin = (struct sockaddr_in *)&gsf->gf_group; - if (psin->sin_family != AF_INET) { - err = -EADDRNOTAVAIL; - goto mc_msf_out; - } - msf->imsf_multiaddr = psin->sin_addr.s_addr; - msf->imsf_interface = 0; - msf->imsf_fmode = gsf->gf_fmode; - msf->imsf_numsrc = gsf->gf_numsrc; - err = -EADDRNOTAVAIL; - for (i = 0; i < gsf->gf_numsrc; ++i) { - psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - - if (psin->sin_family != AF_INET) - goto mc_msf_out; - msf->imsf_slist[i] = psin->sin_addr.s_addr; - } - kfree(gsf); - gsf = NULL; - - err = ip_mc_msfilter(sk, msf, ifindex); + err = set_mcast_msfilter(sk, gsf->gf_interface, + gsf->gf_numsrc, gsf->gf_fmode, + &gsf->gf_group, gsf->gf_slist); mc_msf_out: - kfree(msf); kfree(gsf); break; } -- cgit 
v1.2.3 From 2e04172875c9daf929659eb5c3ef4b98fdf34396 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2020 15:39:43 -0400 Subject: ipv4: do compat setsockopt for MCAST_MSFILTER directly Parallel to what the native setsockopt() does, except that unlike the native setsockopt() we do not use memdup_user() - we want the sockaddr_storage fields properly aligned, so we allocate 4 bytes more and copy compat_group_filter at the offset 4, which yields the proper alignments. Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cc0441157b02..b6b889b5dacf 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1286,9 +1286,55 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ip_setsockopt); + case MCAST_MSFILTER: + { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + if (copy_from_user(gf32, optval, optlen)) { + err = -EFAULT; + goto mc_msf_out; + } + + n = gf32->gf_numsrc; + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n >= 0x1ffffff) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + + rtnl_lock(); + lock_sock(sk); + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + err = -ENOBUFS; + else + err = set_mcast_msfilter(sk, gf32->gf_interface, + n, gf32->gf_fmode, + &gf32->gf_group, gf32->gf_slist); + release_sock(sk); + rtnl_unlock(); +mc_msf_out: + kfree(p); + return err; + } } err = do_ip_setsockopt(sk, level, optname, optval, optlen); -- cgit v1.2.3 From d59eb177c84f9572a6b51024c0b2611c3b5a27c5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2020 15:43:10 -0400 Subject: ip6_mc_msfilter(): pass the address list separately that way we'll be able to reuse it for compat case Signed-off-by: Al Viro --- include/net/ipv6.h | 3 ++- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/mcast.c | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c45eb78d970f..39a00d3ef5e2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1136,7 +1136,8 @@ struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); -int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, + struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage __user *p); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 0bbafe73bdde..7d3ecc0e69d1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -780,7 +780,7 @@ done: retv = -EINVAL; break; } - retv = ip6_mc_msfilter(sk, gsf); + retv = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); kfree(gsf); break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 
97d796c7d6c0..7e12d2114158 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -457,7 +457,8 @@ done: return err; } -int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, + struct sockaddr_storage *list) { const struct in6_addr *group; struct ipv6_mc_socklist *pmc; @@ -509,10 +510,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; - for (i = 0; i < newpsl->sl_count; ++i) { + for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; - psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } err = ip6_mc_add_src(idev, group, gsf->gf_fmode, -- cgit v1.2.3 From 168a2cca81438aef819e43feb161614488dee97b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2020 16:40:04 -0400 Subject: ipv6: do compat setsockopt for MCAST_MSFILTER directly similar to the ipv4 counterpart of that patch - the same trick used to align the tail array properly. Signed-off-by: Al Viro --- net/ipv6/ipv6_sockglue.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7d3ecc0e69d1..2b5029df8f1e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -980,9 +980,55 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: return compat_mc_setsockopt(sk, level, optname, optval, optlen, ipv6_setsockopt); + case MCAST_MSFILTER: + { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + if (copy_from_user(gf32, optval, optlen)) { + err = -EFAULT; + goto mc_msf_out; + } + + n = gf32->gf_numsrc; + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n >= 0x1ffffffU || + n > sysctl_mld_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + + rtnl_lock(); + lock_sock(sk); + err = ip6_mc_msfilter(sk, &(struct group_filter){ + .gf_interface = gf32->gf_interface, + .gf_group = gf32->gf_group, + .gf_fmode = gf32->gf_fmode, + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + release_sock(sk); + rtnl_unlock(); +mc_msf_out: + kfree(p); + return err; + } } err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); -- cgit v1.2.3 From 2f984f11fdc06bcfd5bb528d07a93c20301dd068 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Apr 2020 19:56:22 -0400 Subject: ipv[46]: do compat setsockopt for MCAST_{JOIN,LEAVE}_GROUP directly direct parallel to the way these two are handled in the native ->setsockopt() instances - the helpers that do the real work are already separated and can be reused as-is in this case. 
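For reference, a minimal userspace sketch of the request this compat path serves. The helper name join_group(), the interface index and the 224.0.1.1 group address are arbitrary examples, not taken from the patch; built as a 32-bit binary on a 64-bit kernel, the setsockopt() below goes through the copyin added here and ends up in the same ip_mc_join_group() helper as the native path:

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Minimal sketch, example values only: join an IPv4 multicast group
 * through the protocol-independent API of RFC 3678. A 32-bit binary
 * lays out struct group_req with 4-byte alignment, which is the layout
 * the compat copyin above converts from.
 */
static int join_group(int fd, unsigned int ifindex)
{
        struct group_req greq;
        struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group;

        memset(&greq, 0, sizeof(greq));
        greq.gr_interface = ifindex;
        psin->sin_family = AF_INET;
        psin->sin_addr.s_addr = htonl(0xe0000101);      /* 224.0.1.1 */

        return setsockopt(fd, IPPROTO_IP, MCAST_JOIN_GROUP,
                          &greq, sizeof(greq));
}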
Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 31 +++++++++++++++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b6b889b5dacf..34c3a43a9c98 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1282,6 +1282,37 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void __user *)optval; + struct group_req greq; + struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + + if (get_user(greq.gr_interface, &gr32->gr_interface) || + copy_from_user(&greq.gr_group, &gr32->gr_group, + sizeof(greq.gr_group))) + return -EFAULT; + + if (psin->sin_family != AF_INET) + return -EINVAL; + + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + rtnl_lock(); + lock_sock(sk); + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2b5029df8f1e..209d827950cc 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -976,6 +976,34 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void __user *)optval; + struct group_req greq; + struct sockaddr_in6 *psin6 = (struct sockaddr_in6 *)&greq.gr_group; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + + if (get_user(greq.gr_interface, &gr32->gr_interface) || + copy_from_user(&greq.gr_group, &gr32->gr_group, + sizeof(greq.gr_group))) + return -EFAULT; + + if (greq.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + rtnl_lock(); + lock_sock(sk); + if (optname == MCAST_JOIN_GROUP) + err = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + err = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: -- cgit v1.2.3 From 2bbf8c1ead651c65bd0f7b6ba6d57cb09e2d1a57 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 10:49:26 -0400 Subject: ipv4: take handling of group_source_req options into a helper Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 83 ++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 34c3a43a9c98..7f065a68664e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -624,6 +624,49 @@ Eaddrnotavail: return -EADDRNOTAVAIL; } +static int do_mcast_group_source(struct sock *sk, int optname, + struct group_source_req *greqs) +{ + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add, err; + + if (greqs->gsr_group.ss_family != AF_INET || + greqs->gsr_source.ss_family != AF_INET) + return -EADDRNOTAVAIL; + + psin = (struct sockaddr_in *)&greqs->gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in 
*)&greqs->gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct ip_mreqn mreq; + + psin = (struct sockaddr_in *)&greqs->gsr_group; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs->gsr_interface; + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); + if (err && err != -EADDRINUSE) + return err; + greqs->gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + return ip_mc_source(add, omode, sk, &mreqs, greqs->gsr_interface); +} + static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -1066,9 +1109,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, case MCAST_UNBLOCK_SOURCE: { struct group_source_req greqs; - struct ip_mreq_source mreqs; - struct sockaddr_in *psin; - int omode, add; if (optlen != sizeof(struct group_source_req)) goto e_inval; @@ -1076,42 +1116,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EFAULT; break; } - if (greqs.gsr_group.ss_family != AF_INET || - greqs.gsr_source.ss_family != AF_INET) { - err = -EADDRNOTAVAIL; - break; - } - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs.gsr_source; - mreqs.imr_sourceaddr = psin->sin_addr.s_addr; - mreqs.imr_interface = 0; /* use index for mc_source */ - - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct ip_mreqn mreq; - - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); - if (err && err != -EADDRINUSE) - break; - greqs.gsr_interface = mreq.imr_ifindex; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, - greqs.gsr_interface); + err = do_mcast_group_source(sk, optname, &greqs); break; } case MCAST_MSFILTER: -- cgit v1.2.3 From fcfa0b09d3f794af66cf8bcee03ddb7055934742 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 11:00:01 -0400 Subject: ipv6: take handling of group_source_req options into a helper Signed-off-by: Al Viro --- net/ipv6/ipv6_sockglue.c | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 209d827950cc..bb049feeb787 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -136,6 +136,41 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int do_ipv6_mcast_group_source(struct sock *sk, int optname, + struct group_source_req *greqs) +{ + int omode, add; + + if (greqs->gsr_group.ss_family != AF_INET6 || + greqs->gsr_source.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = 
MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + int retv; + + psin6 = (struct sockaddr_in6 *)&greqs->gsr_group; + retv = ipv6_sock_mc_join_ssm(sk, greqs->gsr_interface, + &psin6->sin6_addr, + MCAST_INCLUDE); + /* prior join w/ different source is ok */ + if (retv && retv != -EADDRINUSE) + return retv; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + return ip6_mc_source(add, omode, sk, greqs); +} + static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -715,7 +750,6 @@ done: case MCAST_UNBLOCK_SOURCE: { struct group_source_req greqs; - int omode, add; if (optlen < sizeof(struct group_source_req)) goto e_inval; @@ -723,34 +757,7 @@ done: retv = -EFAULT; break; } - if (greqs.gsr_group.ss_family != AF_INET6 || - greqs.gsr_source.ss_family != AF_INET6) { - retv = -EADDRNOTAVAIL; - break; - } - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct sockaddr_in6 *psin6; - - psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; - retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, - &psin6->sin6_addr, - MCAST_INCLUDE); - /* prior join w/ different source is ok */ - if (retv && retv != -EADDRINUSE) - break; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - retv = ip6_mc_source(add, omode, sk, &greqs); + retv = do_ipv6_mcast_group_source(sk, optname, &greqs); break; } case MCAST_MSFILTER: -- cgit v1.2.3 From b212c322c8d73c48062340dc4cbe150c4ce97fb8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 11:37:02 -0400 Subject: handle the group_source_req options directly Native ->setsockopt() handling of these options (MCAST_..._SOURCE_GROUP and MCAST_{,UN}BLOCK_SOURCE) consists of copyin + call of a helper that does the actual work. The only change needed for ->compat_setsockopt() is a slightly different copyin - the helpers can be reused as-is. 
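To illustrate, a minimal userspace sketch of one request in this family. The helper name join_source_group() and the addresses are arbitrary examples, not taken from the patch; issued from a 32-bit binary, the request goes through the field-by-field copyin added here and is then handled by the existing do_mcast_group_source() helper (do_ipv6_mcast_group_source() in the IPv6 case):

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Minimal sketch, example values only: an IPv4 source-specific join,
 * i.e. receive traffic for the group only from one given sender.
 */
static int join_source_group(int fd, unsigned int ifindex,
                             const char *group, const char *source)
{
        struct group_source_req gsr;
        struct sockaddr_in *g = (struct sockaddr_in *)&gsr.gsr_group;
        struct sockaddr_in *s = (struct sockaddr_in *)&gsr.gsr_source;

        memset(&gsr, 0, sizeof(gsr));
        gsr.gsr_interface = ifindex;
        g->sin_family = AF_INET;
        s->sin_family = AF_INET;
        if (inet_pton(AF_INET, group, &g->sin_addr) != 1 ||
            inet_pton(AF_INET, source, &s->sin_addr) != 1)
                return -1;

        return setsockopt(fd, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP,
                          &gsr, sizeof(gsr));
}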
Signed-off-by: Al Viro --- net/ipv4/ip_sockglue.c | 23 +++++++++++++++++++++-- net/ipv6/ipv6_sockglue.c | 23 +++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7f065a68664e..a2469bc57cfe 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1322,8 +1322,27 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - return compat_mc_setsockopt(sk, level, optname, optval, optlen, - ip_setsockopt); + { + struct compat_group_source_req __user *gsr32 = (void __user *)optval; + struct group_source_req greqs; + + if (optlen != sizeof(struct compat_group_source_req)) + return -EINVAL; + + if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || + copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, + sizeof(greqs.gsr_group)) || + copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, + sizeof(greqs.gsr_source))) + return -EFAULT; + + rtnl_lock(); + lock_sock(sk); + err = do_mcast_group_source(sk, optname, &greqs); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_MSFILTER: { const int size0 = offsetof(struct compat_group_filter, gf_slist); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index bb049feeb787..e10258c2210e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1015,8 +1015,27 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - return compat_mc_setsockopt(sk, level, optname, optval, optlen, - ipv6_setsockopt); + { + struct compat_group_source_req __user *gsr32 = (void __user *)optval; + struct group_source_req greqs; + + if (optlen < sizeof(struct compat_group_source_req)) + return -EINVAL; + + if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || + copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, + sizeof(greqs.gsr_group)) || + copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, + sizeof(greqs.gsr_source))) + return -EFAULT; + + rtnl_lock(); + lock_sock(sk); + err = do_ipv6_mcast_group_source(sk, optname, &greqs); + release_sock(sk); + rtnl_unlock(); + return err; + } case MCAST_MSFILTER: { const int size0 = offsetof(struct compat_group_filter, gf_slist); -- cgit v1.2.3 From bbced07d9952ca290e8de3957c75b8b401d7a867 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2020 11:37:02 -0400 Subject: get rid of compat_mc_setsockopt() not used anymore Signed-off-by: Al Viro --- include/net/compat.h | 4 --- net/compat.c | 90 ---------------------------------------------------- 2 files changed, 94 deletions(-) (limited to 'net') diff --git a/include/net/compat.h b/include/net/compat.h index d714076d63d5..f241666117d8 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -67,10 +67,6 @@ int put_cmsg_compat(struct msghdr*, int, int, int, void *); int cmsghdr_from_user_compat_to_kern(struct msghdr *, struct sock *, unsigned char *, int); -int compat_mc_setsockopt(struct sock *, int, int, char __user *, unsigned int, - int (*)(struct sock *, int, int, char __user *, - unsigned int)); - struct compat_group_req { __u32 gr_interface; struct __kernel_sockaddr_storage gr_group diff --git a/net/compat.c b/net/compat.c index 7bdfda2b382a..afd7b444e0bf 100644 --- a/net/compat.c +++ b/net/compat.c @@ -448,96 +448,6 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, return 
__compat_sys_getsockopt(fd, level, optname, optval, optlen); } -#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ - sizeof(struct __kernel_sockaddr_storage)) - -int compat_mc_setsockopt(struct sock *sock, int level, int optname, - char __user *optval, unsigned int optlen, - int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int)) -{ - char __user *koptval = optval; - int koptlen = optlen; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req __user *kgr = - compat_alloc_user_space(sizeof(struct group_req)); - u32 interface; - - if (!access_ok(gr32, sizeof(*gr32)) || - !access_ok(kgr, sizeof(struct group_req)) || - __get_user(interface, &gr32->gr_interface) || - __put_user(interface, &kgr->gr_interface) || - copy_in_user(&kgr->gr_group, &gr32->gr_group, - sizeof(kgr->gr_group))) - return -EFAULT; - koptval = (char __user *)kgr; - koptlen = sizeof(struct group_req); - break; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req __user *kgsr = compat_alloc_user_space( - sizeof(struct group_source_req)); - u32 interface; - - if (!access_ok(gsr32, sizeof(*gsr32)) || - !access_ok(kgsr, - sizeof(struct group_source_req)) || - __get_user(interface, &gsr32->gsr_interface) || - __put_user(interface, &kgsr->gsr_interface) || - copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group, - sizeof(kgsr->gsr_group)) || - copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source, - sizeof(kgsr->gsr_source))) - return -EFAULT; - koptval = (char __user *)kgsr; - koptlen = sizeof(struct group_source_req); - break; - } - case MCAST_MSFILTER: - { - struct compat_group_filter __user *gf32 = (void __user *)optval; - struct group_filter __user *kgf; - u32 interface, fmode, numsrc; - - if (!access_ok(gf32, __COMPAT_GF0_SIZE) || - __get_user(interface, &gf32->gf_interface) || - __get_user(fmode, &gf32->gf_fmode) || - __get_user(numsrc, &gf32->gf_numsrc)) - return -EFAULT; - koptlen = optlen + sizeof(struct group_filter) - - sizeof(struct compat_group_filter); - if (koptlen < GROUP_FILTER_SIZE(numsrc)) - return -EINVAL; - kgf = compat_alloc_user_space(koptlen); - if (!access_ok(kgf, koptlen) || - __put_user(interface, &kgf->gf_interface) || - __put_user(fmode, &kgf->gf_fmode) || - __put_user(numsrc, &kgf->gf_numsrc) || - copy_in_user(&kgf->gf_group, &gf32->gf_group, - sizeof(kgf->gf_group)) || - (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist, - numsrc * sizeof(kgf->gf_slist[0])))) - return -EFAULT; - koptval = (char __user *)kgf; - break; - } - - default: - break; - } - return setsockopt(sock, level, optname, koptval, koptlen); -} -EXPORT_SYMBOL(compat_mc_setsockopt); - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { -- cgit v1.2.3 From 38c53ca3c114fa2a7030f0d1f54feaf044957609 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Apr 2020 10:19:04 -0400 Subject: batadv_socket_read(): get rid of pointless access_ok() address is passed only to copy_to_user() Signed-off-by: Al Viro --- net/batman-adv/icmp_socket.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index ccb535c77e5d..8bdabc03b0b2 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ 
-135,9 +135,6 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; - if (!access_ok(buf, count)) - return -EFAULT; - error = wait_event_interruptible(socket_client->queue_wait, socket_client->queue_len); -- cgit v1.2.3 From 8c2348e36af0da79477b0726781da297263269a4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:20:49 -0400 Subject: atm: separate ATM_GETNAMES handling from the rest of atm_dev_ioctl() atm_dev_ioctl() does copyin in two different ways - one for ATM_GETNAMES, another for everything else. Start with separating the former into a new helper (atm_getnames()). The next step will be to lift the copyin into the callers. Signed-off-by: Al Viro --- net/atm/ioctl.c | 6 +++- net/atm/resources.c | 88 +++++++++++++++++++++++++++-------------------------- net/atm/resources.h | 1 + 3 files changed, 51 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index d955b683aa7c..0b4b07740fe4 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -162,7 +162,11 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, if (error != -ENOIOCTLCMD) goto done; - error = atm_dev_ioctl(cmd, argp, compat); + if (cmd == ATM_GETNAMES) { + error = atm_getnames(argp, compat); + } else { + error = atm_dev_ioctl(cmd, argp, compat); + } done: return error; diff --git a/net/atm/resources.c b/net/atm/resources.c index 889349c6d90d..a2ab75929eec 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -193,61 +193,63 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, return error ? -EFAULT : 0; } -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +int atm_getnames(void __user *arg, int compat) { void __user *buf; - int error, len, number, size = 0; + int error, len, size = 0; struct atm_dev *dev; struct list_head *p; int *tmp_buf, *tmp_p; - int __user *sioc_len; int __user *iobuf_len; - switch (cmd) { - case ATM_GETNAMES: - if (IS_ENABLED(CONFIG_COMPAT) && compat) { + if (IS_ENABLED(CONFIG_COMPAT) && compat) { #ifdef CONFIG_COMPAT - struct compat_atm_iobuf __user *ciobuf = arg; - compat_uptr_t cbuf; - iobuf_len = &ciobuf->length; - if (get_user(cbuf, &ciobuf->buffer)) - return -EFAULT; - buf = compat_ptr(cbuf); + struct compat_atm_iobuf __user *ciobuf = arg; + compat_uptr_t cbuf; + iobuf_len = &ciobuf->length; + if (get_user(cbuf, &ciobuf->buffer)) + return -EFAULT; + buf = compat_ptr(cbuf); #endif - } else { - struct atm_iobuf __user *iobuf = arg; - iobuf_len = &iobuf->length; - if (get_user(buf, &iobuf->buffer)) - return -EFAULT; - } - if (get_user(len, iobuf_len)) + } else { + struct atm_iobuf __user *iobuf = arg; + iobuf_len = &iobuf->length; + if (get_user(buf, &iobuf->buffer)) return -EFAULT; - mutex_lock(&atm_dev_mutex); - list_for_each(p, &atm_devs) - size += sizeof(int); - if (size > len) { - mutex_unlock(&atm_dev_mutex); - return -E2BIG; - } - tmp_buf = kmalloc(size, GFP_ATOMIC); - if (!tmp_buf) { - mutex_unlock(&atm_dev_mutex); - return -ENOMEM; - } - tmp_p = tmp_buf; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); - *tmp_p++ = dev->number; - } + } + if (get_user(len, iobuf_len)) + return -EFAULT; + mutex_lock(&atm_dev_mutex); + list_for_each(p, &atm_devs) + size += sizeof(int); + if (size > len) { mutex_unlock(&atm_dev_mutex); - error = ((copy_to_user(buf, tmp_buf, size)) || - put_user(size, iobuf_len)) - ? 
-EFAULT : 0; - kfree(tmp_buf); - return error; - default: - break; + return -E2BIG; } + tmp_buf = kmalloc(size, GFP_ATOMIC); + if (!tmp_buf) { + mutex_unlock(&atm_dev_mutex); + return -ENOMEM; + } + tmp_p = tmp_buf; + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + *tmp_p++ = dev->number; + } + mutex_unlock(&atm_dev_mutex); + error = ((copy_to_user(buf, tmp_buf, size)) || + put_user(size, iobuf_len)) + ? -EFAULT : 0; + kfree(tmp_buf); + return error; +} + +int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +{ + void __user *buf; + int error, len, number, size = 0; + struct atm_dev *dev; + int __user *sioc_len; if (IS_ENABLED(CONFIG_COMPAT) && compat) { #ifdef CONFIG_COMPAT diff --git a/net/atm/resources.h b/net/atm/resources.h index 048232e4d4c6..18f8e5948ce4 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -14,6 +14,7 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; +int atm_getnames(void __user *arg, int compat); int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); -- cgit v1.2.3 From a3929484af75ee524419edbbc4e9ce012c3d67c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:34:20 -0400 Subject: atm: move copyin from atm_getnames() into the caller Signed-off-by: Al Viro --- net/atm/ioctl.c | 19 ++++++++++++++++++- net/atm/resources.c | 19 +------------------ net/atm/resources.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index 0b4b07740fe4..e239cebf48da 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -56,6 +56,8 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, int error; struct list_head *pos; void __user *argp = (void __user *)arg; + void __user *buf; + int __user *len; vcc = ATM_SD(sock); switch (cmd) { @@ -163,7 +165,22 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, goto done; if (cmd == ATM_GETNAMES) { - error = atm_getnames(argp, compat); + if (IS_ENABLED(CONFIG_COMPAT) && compat) { +#ifdef CONFIG_COMPAT + struct compat_atm_iobuf __user *ciobuf = argp; + compat_uptr_t cbuf; + len = &ciobuf->length; + if (get_user(cbuf, &ciobuf->buffer)) + return -EFAULT; + buf = compat_ptr(cbuf); +#endif + } else { + struct atm_iobuf __user *iobuf = argp; + len = &iobuf->length; + if (get_user(buf, &iobuf->buffer)) + return -EFAULT; + } + error = atm_getnames(buf, len); } else { error = atm_dev_ioctl(cmd, argp, compat); } diff --git a/net/atm/resources.c b/net/atm/resources.c index a2ab75929eec..5507cc608969 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -193,30 +193,13 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, return error ? 
-EFAULT : 0; } -int atm_getnames(void __user *arg, int compat) +int atm_getnames(void __user *buf, int __user *iobuf_len) { - void __user *buf; int error, len, size = 0; struct atm_dev *dev; struct list_head *p; int *tmp_buf, *tmp_p; - int __user *iobuf_len; - if (IS_ENABLED(CONFIG_COMPAT) && compat) { -#ifdef CONFIG_COMPAT - struct compat_atm_iobuf __user *ciobuf = arg; - compat_uptr_t cbuf; - iobuf_len = &ciobuf->length; - if (get_user(cbuf, &ciobuf->buffer)) - return -EFAULT; - buf = compat_ptr(cbuf); -#endif - } else { - struct atm_iobuf __user *iobuf = arg; - iobuf_len = &iobuf->length; - if (get_user(buf, &iobuf->buffer)) - return -EFAULT; - } if (get_user(len, iobuf_len)) return -EFAULT; mutex_lock(&atm_dev_mutex); diff --git a/net/atm/resources.h b/net/atm/resources.h index 18f8e5948ce4..5e2c68d37d63 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -14,7 +14,7 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; -int atm_getnames(void __user *arg, int compat); +int atm_getnames(void __user *buf, int __user *iobuf_len); int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); -- cgit v1.2.3 From 36085049bc0acb6f2e784f430c2cc66944a2ef07 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:41:51 -0400 Subject: atm: switch do_atm_iobuf() to direct use of atm_getnames() ... and sod the compat_alloc_user_space() with its complications Signed-off-by: Al Viro --- net/atm/ioctl.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index e239cebf48da..fdd0e3434523 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -251,32 +251,13 @@ static struct { static int do_atm_iobuf(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct atm_iobuf __user *iobuf; - struct compat_atm_iobuf __user *iobuf32; + struct compat_atm_iobuf __user *iobuf32 = compat_ptr(arg); u32 data; - void __user *datap; - int len, err; - - iobuf = compat_alloc_user_space(sizeof(*iobuf)); - iobuf32 = compat_ptr(arg); - if (get_user(len, &iobuf32->length) || - get_user(data, &iobuf32->buffer)) - return -EFAULT; - datap = compat_ptr(data); - if (put_user(len, &iobuf->length) || - put_user(datap, &iobuf->buffer)) + if (get_user(data, &iobuf32->buffer)) return -EFAULT; - err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0); - - if (!err) { - if (copy_in_user(&iobuf32->length, &iobuf->length, - sizeof(int))) - err = -EFAULT; - } - - return err; + return atm_getnames(&iobuf32->length, compat_ptr(data)); } static int do_atmif_sioc(struct socket *sock, unsigned int cmd, -- cgit v1.2.3 From 8cacb4165985444c275a6f813f91f08479bdbfad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 17:53:35 -0400 Subject: atm: lift copyin from atm_dev_ioctl() Signed-off-by: Al Viro --- net/atm/ioctl.c | 25 ++++++++++++++++++++++++- net/atm/resources.c | 35 +++++------------------------------ net/atm/resources.h | 4 ++-- 3 files changed, 31 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index fdd0e3434523..52f2c77e656f 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -182,7 +182,30 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, } error = atm_getnames(buf, len); } else { - error = atm_dev_ioctl(cmd, argp, compat); + int number; + + if (IS_ENABLED(CONFIG_COMPAT) && compat) { +#ifdef CONFIG_COMPAT + struct compat_atmif_sioc __user *csioc = argp; + compat_uptr_t carg; + + len = &csioc->length; + if 
(get_user(carg, &csioc->arg)) + return -EFAULT; + buf = compat_ptr(carg); + if (get_user(number, &csioc->number)) + return -EFAULT; +#endif + } else { + struct atmif_sioc __user *sioc = argp; + + len = &sioc->length; + if (get_user(buf, &sioc->arg)) + return -EFAULT; + if (get_user(number, &sioc->number)) + return -EFAULT; + } + error = atm_dev_ioctl(cmd, buf, len, number, compat); } done: diff --git a/net/atm/resources.c b/net/atm/resources.c index 5507cc608969..94bdc6527ee8 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -227,39 +227,14 @@ int atm_getnames(void __user *buf, int __user *iobuf_len) return error; } -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, + int number, int compat) { - void __user *buf; - int error, len, number, size = 0; + int error, len, size = 0; struct atm_dev *dev; - int __user *sioc_len; - if (IS_ENABLED(CONFIG_COMPAT) && compat) { -#ifdef CONFIG_COMPAT - struct compat_atmif_sioc __user *csioc = arg; - compat_uptr_t carg; - - sioc_len = &csioc->length; - if (get_user(carg, &csioc->arg)) - return -EFAULT; - buf = compat_ptr(carg); - - if (get_user(len, &csioc->length)) - return -EFAULT; - if (get_user(number, &csioc->number)) - return -EFAULT; -#endif - } else { - struct atmif_sioc __user *sioc = arg; - - sioc_len = &sioc->length; - if (get_user(buf, &sioc->arg)) - return -EFAULT; - if (get_user(len, &sioc->length)) - return -EFAULT; - if (get_user(number, &sioc->number)) - return -EFAULT; - } + if (get_user(len, sioc_len)) + return -EFAULT; dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d", number); diff --git a/net/atm/resources.h b/net/atm/resources.h index 5e2c68d37d63..4a0839e92ff3 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -15,8 +15,8 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; int atm_getnames(void __user *buf, int __user *iobuf_len); -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); - +int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, + int number, int compat); #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From 0edecc020b33f8e31d8baa80735b45e8e8434700 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 May 2020 18:13:56 -0400 Subject: atm: switch do_atmif_sioc() to direct use of atm_dev_ioctl() Signed-off-by: Al Viro --- net/atm/ioctl.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index 52f2c77e656f..838ebf0cabbf 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -286,30 +286,13 @@ static int do_atm_iobuf(struct socket *sock, unsigned int cmd, static int do_atmif_sioc(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct atmif_sioc __user *sioc; - struct compat_atmif_sioc __user *sioc32; + struct compat_atmif_sioc __user *sioc32 = compat_ptr(arg); + int number; u32 data; - void __user *datap; - int err; - sioc = compat_alloc_user_space(sizeof(*sioc)); - sioc32 = compat_ptr(arg); - - if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) || - get_user(data, &sioc32->arg)) - return -EFAULT; - datap = compat_ptr(data); - if (put_user(datap, &sioc->arg)) + if (get_user(data, &sioc32->arg) || get_user(number, &sioc32->number)) return -EFAULT; - - err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0); - - if (!err) { - if (copy_in_user(&sioc32->length, &sioc->length, - sizeof(int))) - err = -EFAULT; - } - return err; + 
return atm_dev_ioctl(cmd, compat_ptr(data), &sioc32->length, number, 0); } static int do_atm_ioctl(struct socket *sock, unsigned int cmd32, -- cgit v1.2.3 From d8bed686ab96169ac80b497d1cbed89300d97f83 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 19 May 2020 22:45:20 +0800 Subject: net: psample: Add tunnel support Currently, psample can only send the packet bits after decapsulation. The tunnel information is lost. Add the tunnel support. If the sampled packet has no tunnel info, the behavior is the same as before. If it has, add a nested metadata field named PSAMPLE_ATTR_TUNNEL and include the tunnel subfields if applicable. Increase the metadata length for sampled packet with the tunnel info. If new subfields of tunnel info should be included, update the metadata length accordingly. Signed-off-by: Chris Mi Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/psample.h | 22 ++++++ net/psample/psample.c | 157 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index ce1116cff53d..aea26ab1431c 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -11,6 +11,7 @@ enum { PSAMPLE_ATTR_GROUP_SEQ, PSAMPLE_ATTR_SAMPLE_RATE, PSAMPLE_ATTR_DATA, + PSAMPLE_ATTR_TUNNEL, /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, @@ -25,6 +26,27 @@ enum psample_command { PSAMPLE_CMD_DEL_GROUP, }; +enum psample_tunnel_key_attr { + PSAMPLE_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + PSAMPLE_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + PSAMPLE_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + PSAMPLE_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_VXLAN_OPTS, /* Nested VXLAN opts* */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_PAD, + PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. 
IPV4_INFO_BRIDGE mode.*/ + __PSAMPLE_TUNNEL_KEY_ATTR_MAX +}; + /* Can be overridden at runtime by module option */ #define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) diff --git a/net/psample/psample.c b/net/psample/psample.c index 6f2fbc6b9eb2..34a74043840b 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #define PSAMPLE_MAX_PACKET_SIZE 0xffff @@ -207,10 +209,155 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const void *tun_opts = ip_tunnel_info_opts(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + + if (tun_key->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, + PSAMPLE_TUNNEL_KEY_ATTR_PAD)) + return -EMSGSIZE; + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) + return -EMSGSIZE; + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, + tun_key->u.ipv4.src)) + return -EMSGSIZE; + if (tun_key->u.ipv4.dst && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, + tun_key->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, + &tun_key->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&tun_key->u.ipv6.dst) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, + &tun_key->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (tun_key->tos && + nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (tun_key->tp_src && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) + return -EMSGSIZE; + if (tun_key->tp_dst && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + } + + return 0; +} + +static int psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __psample_ip_tun_to_nlattr(skb, tun_info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + + return 0; +} + +static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + int sum = 0; + + if 
(tun_key->tun_flags & TUNNEL_KEY) + sum += nla_total_size(sizeof(u64)); + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) + sum += nla_total_size(0); + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src) + sum += nla_total_size(sizeof(u32)); + if (tun_key->u.ipv4.dst) + sum += nla_total_size(sizeof(u32)); + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src)) + sum += nla_total_size(sizeof(struct in6_addr)); + if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) + sum += nla_total_size(sizeof(struct in6_addr)); + break; + } + if (tun_key->tos) + sum += nla_total_size(sizeof(u8)); + sum += nla_total_size(sizeof(u8)); /* TTL */ + if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) + sum += nla_total_size(0); + if (tun_key->tun_flags & TUNNEL_CSUM) + sum += nla_total_size(0); + if (tun_key->tp_src) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tp_dst) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tun_flags & TUNNEL_OAM) + sum += nla_total_size(0); + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) + sum += nla_total_size(tun_opts_len); + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT) + sum += nla_total_size(tun_opts_len); + } + + return sum; +} + void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { + struct ip_tunnel_info *tun_info; struct sk_buff *nl_skb; int data_len; int meta_len; @@ -224,6 +371,10 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ + tun_info = skb_tunnel_info(skb); + if (tun_info) + meta_len += psample_tunnel_meta_len(tun_info); + data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN @@ -278,6 +429,12 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } + if (tun_info) { + ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); + if (unlikely(ret < 0)) + goto error; + } + genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); -- cgit v1.2.3 From 8066021915924f58ed338bf38208215f5a7355f6 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 20 May 2020 08:29:14 +0200 Subject: ethtool: provide UAPI for PHY Signal Quality Index (SQI) Signal Quality Index is a mandatory value required by "OPEN Alliance SIG" for the 100Base-T1 PHYs [1]. This indicator can be used for cable integrity diagnostics and for investigating other noise sources, and is implemented by at least two vendors: NXP[2] and TI[3]. [1] http://www.opensig.org/download/document/218/Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf [2] https://www.nxp.com/docs/en/data-sheet/TJA1100.pdf [3] https://www.ti.com/product/DP83TC811R-Q1 Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: David S.
Miller --- Documentation/networking/ethtool-netlink.rst | 6 ++- include/linux/phy.h | 2 + include/uapi/linux/ethtool_netlink.h | 2 + net/ethtool/linkstate.c | 75 +++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index eed46b6aa07d..7e651ea33eab 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -454,10 +454,12 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== + ==================================== ====== ============================ ``ETHTOOL_A_LINKSTATE_HEADER`` nested reply header ``ETHTOOL_A_LINKSTATE_LINK`` bool link state (up/down) - ==================================== ====== ========================== + ``ETHTOOL_A_LINKSTATE_SQI`` u32 Current Signal Quality Index + ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value + ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns carrier flag provided by ``netif_carrier_ok()`` but there are drivers which diff --git a/include/linux/phy.h b/include/linux/phy.h index 467aa8bf9f64..2bcdf19ed3b4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -723,6 +723,8 @@ struct phy_driver { struct ethtool_tunable *tuna, const void *data); int (*set_loopback)(struct phy_device *dev, bool enable); + int (*get_sqi)(struct phy_device *dev); + int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2881af411f76..e6f109b76c9a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -232,6 +232,8 @@ enum { ETHTOOL_A_LINKSTATE_UNSPEC, ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + ETHTOOL_A_LINKSTATE_SQI, /* u32 */ + ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2740cde0a182..7f47ba89054e 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -2,6 +2,7 @@ #include "netlink.h" #include "common.h" +#include struct linkstate_req_info { struct ethnl_req_info base; @@ -10,6 +11,8 @@ struct linkstate_req_info { struct linkstate_reply_data { struct ethnl_reply_data base; int link; + int sqi; + int sqi_max; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -20,8 +23,46 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, }; +static int linkstate_get_sqi(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + +static int linkstate_get_sqi_max(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + 
mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi_max) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi_max(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,6 +75,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; data->link = __ethtool_get_link(dev); + + ret = linkstate_get_sqi(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi = ret; + + ret = linkstate_get_sqi_max(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi_max = ret; + ethnl_ops_complete(dev); return 0; @@ -42,8 +96,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, static int linkstate_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; + + if (data->sqi != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + if (data->sqi_max != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + return len; } static int linkstate_fill_reply(struct sk_buff *skb, @@ -56,6 +121,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; + if (data->sqi != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; + + if (data->sqi_max != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From d20a1676df7e4c3c23d73299159811a50e4854bc Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:50 +0200 Subject: xsk: Move xskmap.c to net/xdp/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP is partly implemented by net/xdp/xsk.c. Move xskmap.c from kernel/bpf/ to net/xdp/, which is the logical place for AF_XDP related code. Also, move AF_XDP struct definitions, and function declarations only used by AF_XDP internals into net/xdp/xsk.h. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-3-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 20 ---- kernel/bpf/Makefile | 3 - kernel/bpf/xskmap.c | 265 ------------------------------------------------ net/xdp/Makefile | 2 +- net/xdp/xsk.h | 16 +++ net/xdp/xskmap.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 284 insertions(+), 289 deletions(-) delete mode 100644 kernel/bpf/xskmap.c create mode 100644 net/xdp/xskmap.c (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6b1137ce1692..8f3f6f5b0dfe 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -65,22 +65,12 @@ struct xdp_umem { struct list_head xsk_tx_list; }; -/* Nodes are linked in the struct xdp_sock map_list field, and used to - * track which maps a certain socket reside in. 
- */ - struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; -struct xsk_map_node { - struct list_head node; - struct xsk_map *map; - struct xdp_sock **map_entry; -}; - struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -114,7 +104,6 @@ struct xdp_sock { struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); @@ -133,10 +122,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -248,11 +233,6 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; } -static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return false; -} - static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { return false; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c deleted file mode 100644 index 2cc5c8f4c800..000000000000 --- a/kernel/bpf/xskmap.c +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* XSKMAP used for AF_XDP sockets - * Copyright(c) 2018 Intel Corporation. 
- */ - -#include -#include -#include -#include -#include - -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - -static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *node; - int err; - - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); - if (!node) - return ERR_PTR(-ENOMEM); - - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } - - node->map = map; - node->map_entry = map_entry; - return node; -} - -static void xsk_map_node_free(struct xsk_map_node *node) -{ - xsk_map_put(node->map); - kfree(node); -} - -static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) -{ - spin_lock_bh(&xs->map_list_lock); - list_add_tail(&node->node, &xs->map_list); - spin_unlock_bh(&xs->map_list_lock); -} - -static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - struct xsk_map_node *n, *tmp; - - spin_lock_bh(&xs->map_list_lock); - list_for_each_entry_safe(n, tmp, &xs->map_list, node) { - if (map_entry == n->map_entry) { - list_del(&n->node); - xsk_map_node_free(n); - } - } - spin_unlock_bh(&xs->map_list_lock); -} - -static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) -{ - struct bpf_map_memory mem; - int err, numa_node; - struct xsk_map *m; - u64 size; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) - return ERR_PTR(-EINVAL); - - numa_node = bpf_map_attr_numa_node(attr); - size = struct_size(m, xsk_map, attr->max_entries); - - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); - return ERR_PTR(-ENOMEM); - } - - bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); - spin_lock_init(&m->lock); - - return &m->map; -} - -static void xsk_map_free(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - - bpf_clear_redirect_map(map); - synchronize_net(); - bpf_map_area_free(m); -} - -static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - u32 index = key ? 
*(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= m->map.max_entries) { - *next = 0; - return 0; - } - - if (index == m->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) -{ - const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; - struct bpf_insn *insn = insn_buf; - - *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); - *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); - *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); - *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); - *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *insn++ = BPF_MOV64_IMM(ret, 0); - return insn - insn_buf; -} - -static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __xsk_map_lookup_elem(map, *(u32 *)key); -} - -static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; - u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xsk_map_node *node; - struct socket *sock; - int err; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= m->map.max_entries)) - return -E2BIG; - - sock = sockfd_lookup(fd, &err); - if (!sock) - return err; - - if (sock->sk->sk_family != PF_XDP) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - xs = (struct xdp_sock *)sock->sk; - - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - - map_entry = &m->xsk_map[i]; - node = xsk_map_node_alloc(m, map_entry); - if (IS_ERR(node)) { - sockfd_put(sock); - return PTR_ERR(node); - } - - spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); - if (old_xs == xs) { - err = 0; - goto out; - } else if (old_xs && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto out; - } else if (!old_xs && map_flags == BPF_EXIST) { - err = -ENOENT; - goto out; - } - xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - sockfd_put(sock); - return 0; - -out: - spin_unlock_bh(&m->lock); - sockfd_put(sock); - xsk_map_node_free(node); - return err; -} - -static int xsk_map_delete_elem(struct bpf_map *map, void *key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; - int k = *(u32 *)key; - - if (k >= map->max_entries) - return -EINVAL; - - spin_lock_bh(&m->lock); - map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); - if (old_xs) - xsk_map_sock_delete(old_xs, map_entry); - spin_unlock_bh(&m->lock); - - return 0; -} - -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) -{ - spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); - xsk_map_sock_delete(xs, map_entry); - } - spin_unlock_bh(&map->lock); -} - -const struct bpf_map_ops xsk_map_ops = { - .map_alloc = xsk_map_alloc, - .map_free = xsk_map_free, - .map_get_next_key = xsk_map_get_next_key, - .map_lookup_elem = xsk_map_lookup_elem, - .map_gen_lookup = xsk_map_gen_lookup, - .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, - 
.map_update_elem = xsk_map_update_elem, - .map_delete_elem = xsk_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 71e2bdafb2ce..90b5460d6166 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 4cfd106bdb53..d6a0979050e6 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -17,9 +17,25 @@ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 cr; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + #endif /* XSK_H_ */ diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c new file mode 100644 index 000000000000..1dc7208c71ba --- /dev/null +++ b/net/xdp/xskmap.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include +#include +#include +#include +#include + +#include "xsk.h" + +int xsk_map_inc(struct xsk_map *map) +{ + bpf_map_inc(&map->map); + return 0; +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return ERR_PTR(-ENOMEM); + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + struct bpf_map_memory mem; + int err, numa_node; + struct xsk_map *m; + u64 size; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + + err = bpf_map_charge_init(&mem, size); + if (err < 0) + return ERR_PTR(err); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + 
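/* From here on the charge taken via bpf_map_charge_init() is owned by
+	 * the map: bpf_map_charge_move() below transfers it, so it is released
+	 * when the map itself is freed rather than on this function's exit.
+	 */
+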
bpf_map_init_from_attr(&m->map, attr); + bpf_map_charge_move(&m->map.memory, &mem); + spin_lock_init(&m->lock); + + return &m->map; +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + bpf_clear_redirect_map(map); + synchronize_net(); + bpf_map_area_free(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xsk_map_node *node; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } + + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + sockfd_put(sock); + return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs, **map_entry; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + + return 0; +} + +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + 
struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, +}; -- cgit v1.2.3 From a71506a4fda92a39c8ece119876bc7ccde6d3c9d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 20 May 2020 21:20:51 +0200 Subject: xsk: Move driver interface to xdp_sock_drv.h Move the AF_XDP zero-copy driver interface to its own include file called xdp_sock_drv.h. This, hopefully, will make it more clear for NIC driver implementors to know what functions to use for zero-copy support. v4->v5: Fix -Wmissing-prototypes by include header file. (Jakub) Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-4-bjorn.topel@gmail.com --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xsk/umem.c | 2 +- include/net/xdp_sock.h | 214 +------------------- include/net/xdp_sock_drv.h | 217 +++++++++++++++++++++ net/ethtool/channels.c | 2 +- net/ethtool/ioctl.c | 2 +- net/xdp/xdp_umem.h | 2 +- net/xdp/xsk.c | 2 +- net/xdp/xsk_queue.c | 1 + 15 files changed, 238 insertions(+), 218 deletions(-) create mode 100644 include/net/xdp_sock_drv.h (limited to 'net') diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2a037ec244b9..d6b2db4f2c65 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11,7 +11,7 @@ #include "i40e_diag.h" #include "i40e_xsk.h" #include -#include +#include /* All i40e tracepoints are defined by the include below, which * must be included exactly once across the whole kernel with * CREATE_TRACE_POINTS defined diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 2b9184aead5f..d8b0be29099a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include -#include +#include #include #include "i40e.h" diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 23e5515d4527..70e204307a93 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. */ #include -#include +#include #include #include "ice.h" #include "ice_base.h" diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index a656ee9a1fae..82e4effae704 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. 
*/ #include -#include +#include #include #include "ixgbe.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 761c8979bd41..3507d23f0eb8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -31,7 +31,7 @@ */ #include -#include +#include #include "en/xdp.h" #include "en/params.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h index cab0e93497ae..a8e11adbf426 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_RX_H__ #include "en.h" -#include +#include /* RX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h index 79b487d89757..39fa0a705856 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_TX_H__ #include "en.h" -#include +#include /* TX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c index 4baaa5788320..5e49fdb564b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2019 Mellanox Technologies. */ -#include +#include #include "umem.h" #include "setup.h" #include "en/params.h" diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 8f3f6f5b0dfe..6a986dcbc336 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -15,6 +15,7 @@ struct net_device; struct xsk_queue; +struct xdp_buff; /* Masks for xdp_umem_page flags. 
* The low 12-bits of the addr will be 0 since this is the page address, so we @@ -101,27 +102,9 @@ struct xdp_sock { spinlock_t map_list_lock; }; -struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS -int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -/* Used from netdev driver */ -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -153,131 +136,24 @@ static inline u64 xsk_umem_add_offset_to_addr(u64 addr) return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. 
- */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - #else + static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { return -ENOTSUPP; } -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) -{ -} - -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) -{ - return false; -} - -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) -{ -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - return NULL; + return -EOPNOTSUPP; } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +static inline void __xsk_map_flush(void) { } -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) { return NULL; } @@ -297,80 +173,6 @@ static inline u64 xsk_umem_add_offset_to_addr(u64 addr) return 0; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) -{ - return false; -} - -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - -static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) -{ - return -EOPNOTSUPP; -} - -static inline void __xsk_map_flush(void) -{ -} - -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h new file mode 100644 index 000000000000..d67f2361937a --- /dev/null +++ b/include/net/xdp_sock_drv.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Interface for implementing AF_XDP zero-copy support in drivers. + * Copyright(c) 2020 Intel Corporation. 
+ */ + +#ifndef _LINUX_XDP_SOCK_DRV_H +#define _LINUX_XDP_SOCK_DRV_H + +#include + +#ifdef CONFIG_XDP_SOCKETS + +bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_release_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq); +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem); +void xsk_set_tx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + unsigned long page_addr; + + addr = xsk_umem_add_offset_to_addr(addr); + page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; + + return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + addr = xsk_umem_add_offset_to_addr(addr); + + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); +} + +/* Reuse-queue aware version of FILL queue helpers */ +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (rq->length >= cnt) + return true; + + return xsk_umem_has_addrs(umem, cnt - rq->length); +} + +static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + return xsk_umem_peek_addr(umem, addr); + + *addr = rq->handles[rq->length - 1]; + return addr; +} + +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + xsk_umem_release_addr(umem); + else + rq->length--; +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + rq->handles[rq->length++] = addr; +} + +/* Handle the offset appropriately depending on aligned or unaligned mode. + * For unaligned mode, we store the offset in the upper 16-bits of the address. + * For aligned mode, we simply add the offset to the address. 
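+ * (XSK_UNALIGNED_BUF_OFFSET_SHIFT is 48, so the offset occupies the
+ * upper 16 bits of the 64-bit buffer handle.)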
+ */ +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, + u64 offset) +{ + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) + return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + else + return address + offset; +} + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr; +} + +#else + +static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + +static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_release_addr(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, + struct xdp_desc *desc) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + return NULL; +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( + struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) +{ + return NULL; +} + +static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ +} + +static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + return NULL; +} + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ +} + +static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return false; +} + +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, + u64 offset) +{ + return 0; +} + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return 0; +} + +#endif /* CONFIG_XDP_SOCKETS */ + +#endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 389924b65d05..658a8580b464 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "netlink.h" #include "common.h" diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 52102ab1709b..74892623bacd 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index a63a9fb251f5..32067fe98f65 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,7 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include +#include int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45ffd67b367d..8bda654e82ec 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include 
#include #include "xsk_queue.h" diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 57fb81bd593c..554b1ebb4d02 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "xsk_queue.h" -- cgit v1.2.3 From 89e4a376e3a3dab639a3947a6c7cf5d461d1aa4c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:52 +0200 Subject: xsk: Move defines only used by AF_XDP internals to xsk.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the XSK_NEXT_PG_CONTIG_{MASK,SHIFT}, and XDP_UMEM_USES_NEED_WAKEUP defines from xdp_sock.h to the AF_XDP internal xsk.h file. Also, start using the BIT{,_ULL} macro instead of explicit shifts. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-5-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 14 -------------- net/xdp/xsk.h | 14 ++++++++++++++ net/xdp/xsk_queue.h | 2 ++ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6a986dcbc336..fb7fe3060175 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,13 +17,6 @@ struct net_device; struct xsk_queue; struct xdp_buff; -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) - struct xdp_umem_page { void *addr; dma_addr_t dma; @@ -35,13 +28,6 @@ struct xdp_umem_fq_reuse { u64 handles[]; }; -/* Flags for the umem flags field. - * - * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public - * flags. See inlude/uapi/include/linux/if_xdp.h. - */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index d6a0979050e6..455ddd480f3d 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,20 @@ #ifndef XSK_H_ #define XSK_H_ +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) + struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 648733ec24ac..a322a7dac58c 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,6 +10,8 @@ #include #include +#include "xsk.h" + struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 2b43470add8c8ff1e1ee28dffc5c5df97e955d09 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:53 +0200 Subject: xsk: Introduce AF_XDP buffer allocation API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. 
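As a rough, non-authoritative sketch of the driver-side flow this series is aiming at: the xsk_buff_*() and xdp_rxq_info_reg_mem_model() calls below are the interface added or used by this series, while every hypothetical_* identifier (and the ring layout behind it) is an invented placeholder, not a real driver symbol.

#include <linux/bpf.h>
#include <net/xdp_sock_drv.h>

/* Sketch only: assumes a driver-private ring that carries the UMEM and an
 * xdp_rxq_info; all hypothetical_* helpers are made up for illustration.
 */
static int hypothetical_rxq_init(struct hypothetical_ring *ring)
{
	xsk_buff_set_rxq_info(ring->umem, &ring->xdp_rxq);

	/* RX buffers on this queue come from the AF_XDP buffer pool. */
	return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					  MEM_TYPE_XSK_BUFF_POOL, NULL);
}

static void hypothetical_refill(struct hypothetical_ring *ring, u32 count)
{
	struct xdp_buff *xdp;

	while (count--) {
		xdp = xsk_buff_alloc(ring->umem);
		if (!xdp)
			break;
		/* Hand the buffer's DMA address to the hardware. */
		hypothetical_post_rx_desc(ring, xsk_buff_xdp_get_dma(xdp), xdp);
	}
}

static void hypothetical_clean_rx_irq(struct hypothetical_ring *ring,
				      struct xdp_buff *xdp, u32 len)
{
	xsk_buff_dma_sync_for_cpu(xdp);
	xdp->data_end = xdp->data + len;

	if (hypothetical_run_xdp(ring, xdp) == XDP_DROP)
		xsk_buff_free(xdp);	/* buffer goes back to the pool */
	/* On XDP_REDIRECT to an AF_XDP socket the core releases the buffer. */
}
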
A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com --- include/net/xdp.h | 4 +- include/net/xdp_sock.h | 2 + include/net/xdp_sock_drv.h | 164 ++++++++++++++++ include/net/xsk_buff_pool.h | 56 ++++++ include/trace/events/xdp.h | 3 +- net/core/xdp.c | 14 +- net/xdp/Makefile | 1 + net/xdp/xdp_umem.c | 19 +- net/xdp/xsk.c | 147 +++++--------- net/xdp/xsk_buff_pool.c | 467 ++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_diag.c | 2 +- net/xdp/xsk_queue.h | 59 ++++-- 12 files changed, 819 insertions(+), 119 deletions(-) create mode 100644 include/net/xsk_buff_pool.h create mode 100644 net/xdp/xsk_buff_pool.c (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 3094fccf5a88..f432134c7c00 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_ZERO_COPY, + MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -119,7 +120,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index fb7fe3060175..6e7265f63c04 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,11 +31,13 @@ struct xdp_umem_fq_reuse { struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; + struct xsk_buff_pool *pool; struct xdp_umem_page *pages; u64 chunk_mask; u64 size; u32 headroom; u32 chunk_size_nohr; + u32 chunk_size; struct user_struct *user; refcount_t users; struct work_struct work; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index d67f2361937a..7752c8663d1b 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -7,6 +7,7 @@ #define _LINUX_XDP_SOCK_DRV_H #include +#include #ifdef CONFIG_XDP_SOCKETS @@ -101,6 +102,94 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return umem->chunk_size_nohr; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return XDP_PACKET_HEADROOM + umem->headroom; +} + +static inline u32 
xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return umem->chunk_size; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ + xp_set_rxq_info(umem->pool, rxq); +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ + xp_dma_unmap(umem->pool, attrs); +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_dma(xskb); +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_frame_dma(xskb); +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return xp_alloc(umem->pool); +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return xp_can_alloc(umem->pool, count); +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_free(xskb); +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_dma(umem->pool, addr); +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_data(umem->pool, addr); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_dma_sync_for_cpu(xskb); +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ + xp_dma_sync_for_device(umem->pool, dma, size); +} + #else static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) @@ -212,6 +301,81 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return 0; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return NULL; +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return false; +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ +} + +static inline void 
xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h new file mode 100644 index 000000000000..9f221b36e405 --- /dev/null +++ b/include/net/xsk_buff_pool.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef XSK_BUFF_POOL_H_ +#define XSK_BUFF_POOL_H_ + +#include +#include +#include + +struct xsk_buff_pool; +struct xdp_rxq_info; +struct xsk_queue; +struct xdp_desc; +struct device; +struct page; + +struct xdp_buff_xsk { + struct xdp_buff xdp; + dma_addr_t dma; + dma_addr_t frame_dma; + struct xsk_buff_pool *pool; + bool unaligned; + u64 orig_addr; + struct list_head free_list_node; +}; + +/* AF_XDP core. */ +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned); +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +void xp_destroy(struct xsk_buff_pool *pool); +void xp_release(struct xdp_buff_xsk *xskb); +u64 xp_get_handle(struct xdp_buff_xsk *xskb); +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); + +/* AF_XDP, and XDP core. */ +void xp_free(struct xdp_buff_xsk *xskb); + +/* AF_XDP ZC drivers, via xdp_sock_buff.h */ +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages); +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); + +#endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b95d65e8c628..48547a12fa27 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,8 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) + FN(ZERO_COPY) \ + FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); diff --git a/net/core/xdp.c b/net/core/xdp.c index 490b8f5fa8ee..f0ce8b195193 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include /* struct xdp_mem_allocator */ #include +#include #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -361,7 +362,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) + unsigned long handle, struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -390,6 +391,11 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); + break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! 
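+	 * The xdp_frame paths (xdp_return_frame() and
+	 * xdp_return_frame_rx_napi()) pass xdp == NULL here, so only
+	 * xdp_return_buff() can reach this case.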
*/ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ break; @@ -398,19 +404,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 90b5460d6166..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 37ace3bc0d48..7f04688045d5 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -245,7 +245,7 @@ static void xdp_umem_release(struct xdp_umem *umem) } xsk_reuseq_destroy(umem); - + xp_destroy(umem->pool); xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); @@ -390,6 +390,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; @@ -415,11 +416,21 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) } err = xdp_umem_map_pages(umem); - if (!err) - return 0; + if (err) + goto out_pages; - kvfree(umem->pages); + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { + err = -ENOMEM; + goto out_unmap; + } + return 0; +out_unmap: + xdp_umem_unmap_pages(umem); +out_pages: + kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 8bda654e82ec..6933f0d494ba 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -117,76 +117,67 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. 
- */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; + int err; - return; + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { + xs->rx_dropped++; + return err; } - memcpy(to_buf, from_buf, len + metalen); + xp_release(xskb); + return 0; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; + void *from_buf, *to_buf; u32 metalen; - int err; - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - xs->rx_dropped++; - return -ENOSPC; - } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; metalen = 0; } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); - - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; - } - - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +190,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool explicit_free) { u32 len; @@ -211,8 +203,10 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? 
+ __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +218,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +232,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -404,7 +363,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -860,6 +819,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? 
&xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..e214a5795a62 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "xsk_queue.h" + +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + 
phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? 
xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + +u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? 
xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} +EXPORT_SYMBOL(xp_get_dma); + +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} +EXPORT_SYMBOL(xp_get_frame_dma); + +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu); + +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + if (pool->cheap_dma) + return; + + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a322a7dac58c..9151aef7dbca 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "xsk.h" @@ -172,31 +173,45 @@ out: return false; } -static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, - struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; + q->cached_cons++; + } + + return false; +} + +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; return true; } - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; + return false; +} - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) +{ + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -260,6 +275,20 @@ static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, return xskq_cons_read_addr(q, addr, umem); } +static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_aligned(q, addr); +} + +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return 
xskq_cons_read_addr_unchecked(q, addr); +} + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) -- cgit v1.2.3 From 0807892ecb35734b7ce6f7c29b078f1b60151c94 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:00 +0200 Subject: xsk: Remove MEM_TYPE_ZERO_COPY and corresponding code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of MEM_TYPE_ZERO_COPY. Remove all corresponding code, including the "handle" member of struct xdp_buff. rfc->v1: Fixed spelling in commit message. (Björn) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-13-bjorn.topel@gmail.com --- drivers/net/hyperv/netvsc_bpf.c | 1 - include/net/xdp.h | 9 +-- include/net/xdp_sock.h | 45 ------------ include/net/xdp_sock_drv.h | 149 ---------------------------------------- include/trace/events/xdp.h | 1 - net/core/xdp.c | 42 ++--------- net/xdp/xdp_umem.c | 56 +-------------- net/xdp/xsk.c | 48 +------------ net/xdp/xsk_buff_pool.c | 7 ++ net/xdp/xsk_queue.c | 62 ----------------- net/xdp/xsk_queue.h | 105 ---------------------------- 11 files changed, 15 insertions(+), 510 deletions(-) (limited to 'net') diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 1e0c024b0a93..8e4141552423 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; xdp->frame_sz = PAGE_SIZE; - xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/include/net/xdp.h b/include/net/xdp.h index f432134c7c00..90f11760bd12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -39,7 +39,6 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, - MEM_TYPE_ZERO_COPY, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -55,10 +54,6 @@ struct xdp_mem_info { struct page_pool; -struct zero_copy_allocator { - void (*free)(struct zero_copy_allocator *zca, unsigned long handle); -}; - struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -71,7 +66,6 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; - unsigned long handle; struct xdp_rxq_info *rxq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; @@ -120,8 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6e7265f63c04..96bfc5f5f24e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,26 +17,12 @@ struct net_device; struct xsk_queue; struct xdp_buff; -struct xdp_umem_page { - void *addr; - dma_addr_t dma; -}; - -struct xdp_umem_fq_reuse { - u32 nentries; - u32 length; - u64 handles[]; -}; - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; struct xsk_buff_pool *pool; - struct xdp_umem_page *pages; - u64 chunk_mask; u64 size; u32 headroom; - u32 chunk_size_nohr; u32 chunk_size; struct user_struct *user; refcount_t users; @@ -48,7 +34,6 @@ struct xdp_umem { u8 flags; int 
id; struct net_device *dev; - struct xdp_umem_fq_reuse *fq_reuse; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; @@ -109,21 +94,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return xs; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -146,21 +116,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return 0; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 7752c8663d1b..ccf848f7efa4 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,16 +11,9 @@ #ifdef CONFIG_XDP_SOCKETS -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xdp_umem *umem); void xsk_set_tx_need_wakeup(struct xdp_umem *umem); @@ -28,80 +21,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = 
umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return XDP_PACKET_HEADROOM + umem->headroom; @@ -192,20 +111,6 @@ static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, #else -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { } @@ -220,55 +125,12 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - return NULL; -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} - -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ -} - static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { } @@ -290,17 +152,6 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) return false; } -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return 0; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 48547a12fa27..b73d3e141323 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,6 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ diff --git a/net/core/xdp.c b/net/core/xdp.c index f0ce8b195193..a8c2f243367d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - 
return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -144,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -302,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -362,7 +338,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle, struct xdp_buff *xdp) + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -384,14 +360,6 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! */ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); - break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ xsk_buff_free(xdp); @@ -404,19 +372,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7f04688045d5..19e59d1a5e9f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - xsk_reuseq_destroy(umem); xp_destroy(umem->pool); - 
xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -385,11 +349,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; @@ -408,29 +369,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { - err = -ENOMEM; - goto out_pin; - } - - err = xdp_umem_map_pages(umem); - if (err) - goto out_pages; - umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, headroom, size, unaligned_chunks); if (!umem->pool) { err = -ENOMEM; - goto out_unmap; + goto out_pin; } return 0; -out_unmap: - xdp_umem_unmap_pages(umem); -out_pages: - kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6933f0d494ba..3f2ab732ab8b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -203,8 +185,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, len = xdp->data_end - xdp->data; - return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len, explicit_free); } @@ -588,24 +569,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -688,23 +651,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. 
*/ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e214a5795a62..89dae78865e7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,6 +8,13 @@ #include "xsk_queue.h" +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + struct xsk_buff_pool { struct xsk_queue *fq; struct list_head free_list; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 554b1ebb4d02..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -10,15 +10,6 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) -{ - if (!q) - return; - - q->umem_size = umem_size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -64,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9151aef7dbca..16bf15864788 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -106,90 +104,6 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. 
*/ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} - -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->umem_size || addr >= q->umem_size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) -{ - if (addr >= q->umem_size) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx] & q->chunk_mask; - - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } - - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - -out: - q->cached_cons++; - } - - return false; -} - -static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx]; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - - q->cached_cons++; - } - - return false; -} - static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -267,21 +181,6 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); -} - -static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr_aligned(q, addr); -} - static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) @@ -410,11 +309,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3 From 82c41671ca4f597b6ff05bd5d118161deec26e07 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:01 +0200 Subject: xdp: Simplify xdp_return_{frame, frame_rx_napi, buff} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_return_{frame,frame_rx_napi,buff} function are never used, except in xdp_convert_zc_to_xdp_frame(), by the MEM_TYPE_XSK_BUFF_POOL memory type. 
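For illustration only (nothing below is from this patch), the driver-facing free path for such a buffer is xsk_buff_free(); the helper name mydrv_free_rx_buff() is hypothetical:

#include <net/xdp_sock_drv.h>

/* Hypothetical driver-side sketch: pool-backed buffers go back to
 * their xsk_buff_pool via xsk_buff_free() and must no longer pass
 * through __xdp_return().
 */
static void mydrv_free_rx_buff(struct xdp_buff *xdp)
{
	xsk_buff_free(xdp);
}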
To simplify and reduce code, change so that xdp_convert_zc_to_xdp_frame() calls xsk_buff_free() directly, since the type is known, and remove MEM_TYPE_XSK_BUFF_POOL from the switch statement in the __xdp_return() function. Suggested-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-14-bjorn.topel@gmail.com --- net/core/xdp.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/xdp.c b/net/core/xdp.c index a8c2f243367d..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -335,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -360,33 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_XSK_BUFF_POOL: - /* NB! Only valid from an xdp_buff! */ - xsk_buff_free(xdp); - break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -467,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); -- cgit v1.2.3 From 26062b185eee49142adc45f9aa187d909d02d961 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:02 +0200 Subject: xsk: Explicitly inline functions and move definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to reduce the number of function calls, the struct xsk_buff_pool definition is moved to xsk_buff_pool.h. The functions xp_get_dma(), xp_dma_sync_for_cpu(), xp_dma_sync_for_device(), xp_validate_desc() and various helper functions are explicitly inlined. Further, xp_get_handle() and xp_release() are moved to xsk.c to allow the compiler to perform inlining. rfc->v1: Make sure xp_validate_desc() is inlined for Tx perf.
(Maxim) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-15-bjorn.topel@gmail.com --- include/net/xsk_buff_pool.h | 98 ++++++++++++++++++++++++++--- net/xdp/xsk.c | 15 +++++ net/xdp/xsk_buff_pool.c | 148 ++------------------------------------------ net/xdp/xsk_queue.h | 45 ++++++++++++++ 4 files changed, 156 insertions(+), 150 deletions(-) (limited to 'net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 9f221b36e405..a4ff226505c9 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -4,6 +4,7 @@ #ifndef XSK_BUFF_POOL_H_ #define XSK_BUFF_POOL_H_ +#include #include #include #include @@ -25,6 +26,27 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + /* AF_XDP core. */ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, u32 chunk_size, u32 headroom, u64 size, @@ -32,8 +54,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); -u64 xp_get_handle(struct xdp_buff_xsk *xskb); -bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); @@ -47,10 +67,74 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); -dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); -dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); -void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); -void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size); +static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} + +static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); +static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + xp_dma_sync_for_cpu_slow(xskb); +} + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); +static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, + dma_addr_t dma, size_t size) +{ + if (pool->cheap_dma) + return; + + xp_dma_sync_for_device_slow(pool, dma, size); +} + +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. 
+ */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static inline u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} #endif /* XSK_BUFF_POOL_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3f2ab732ab8b..b6c0f08bd80d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -99,6 +99,21 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 89dae78865e7..540ed75e4482 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,34 +8,6 @@ #include "xsk_queue.h" -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. 
- */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - -struct xsk_buff_pool { - struct xsk_queue *fq; - struct list_head free_list; - dma_addr_t *dma_pages; - struct xdp_buff_xsk *heads; - u64 chunk_mask; - u64 addrs_cnt; - u32 free_list_cnt; - u32 dma_pages_cnt; - u32 heads_cnt; - u32 free_heads_cnt; - u32 headroom; - u32 chunk_size; - u32 frame_len; - bool cheap_dma; - bool unaligned; - void *addrs; - struct device *dev; - struct xdp_buff_xsk *free_heads[]; -}; - static void xp_addr_unmap(struct xsk_buff_pool *pool) { vunmap(pool->addrs); @@ -228,50 +200,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, } EXPORT_SYMBOL(xp_dma_map); -static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, - u64 addr, u32 len) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; - - if (pool->dma_pages_cnt && cross_pg) { - return !(pool->dma_pages[addr >> PAGE_SHIFT] & - XSK_NEXT_PG_CONTIG_MASK); - } - return false; -} - static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr) { return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); } -void xp_release(struct xdp_buff_xsk *xskb) -{ - xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; -} - -static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) -{ - return addr & pool->chunk_mask; -} - -static u64 xp_unaligned_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static u64 xp_unaligned_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static u64 xp_unaligned_add_offset_to_addr(u64 addr) -{ - return xp_unaligned_extract_addr(addr) + - xp_unaligned_extract_offset(addr); -} - static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) { *addr = xp_unaligned_extract_addr(*addr); @@ -370,60 +304,6 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, - struct xdp_desc *desc) -{ - u64 chunk, chunk_end; - - chunk = xp_aligned_extract_addr(pool, desc->addr); - chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); - if (chunk != chunk_end) - return false; - - if (chunk >= pool->addrs_cnt) - return false; - - if (desc->options) - return false; - return true; -} - -static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, - struct xdp_desc *desc) -{ - u64 addr, base_addr; - - base_addr = xp_unaligned_extract_addr(desc->addr); - addr = xp_unaligned_add_offset_to_addr(desc->addr); - - if (desc->len > pool->chunk_size) - return false; - - if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) - return false; - - if (desc->options) - return false; - return true; -} - -bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) -{ - return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : - xp_aligned_validate_desc(pool, desc); -} - -u64 xp_get_handle(struct xdp_buff_xsk *xskb) -{ - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); -} - void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { addr = pool->unaligned ? 
xp_unaligned_add_offset_to_addr(addr) : addr; @@ -440,35 +320,17 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) } EXPORT_SYMBOL(xp_raw_get_dma); -dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) -{ - return xskb->dma; -} -EXPORT_SYMBOL(xp_get_dma); - -dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) { - return xskb->frame_dma; -} -EXPORT_SYMBOL(xp_get_frame_dma); - -void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) -{ - if (xskb->pool->cheap_dma) - return; - dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, xskb->pool->frame_len, DMA_BIDIRECTIONAL); } -EXPORT_SYMBOL(xp_dma_sync_for_cpu); +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); -void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size) +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) { - if (pool->cheap_dma) - return; - dma_sync_single_range_for_device(pool->dev, dma, 0, size, DMA_BIDIRECTIONAL); } -EXPORT_SYMBOL(xp_dma_sync_for_device); +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 16bf15864788..5b5d24d2dd37 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -118,6 +118,51 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) -- cgit v1.2.3 From 2092c910e2399c9e13b199c07421133681b9eaff Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2020 22:13:30 +0200 Subject: batman-adv: Revert "Drop lockdep.h include for soft-interface.c" The commit 1a33e10e4a95 ("net: partially revert dynamic lockdep key changes") reverts the commit ab92d68fc22f ("net: core: add generic lockdep keys"). But it forgot to also revert the commit 5759af0682b3 ("batman-adv: Drop lockdep.h include for soft-interface.c") which depends on the latter. 
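For context, a minimal sketch of the per-queue lockdep class setup that makes the lockdep.h include necessary again (illustrative only; the names mirror batman-adv's historical pattern and are not quoted from this diff):

#include <linux/lockdep.h>
#include <linux/netdevice.h>

static struct lock_class_key batadv_netdev_xmit_lock_key;

/* Callback for netdev_for_each_tx_queue(); lockdep_set_class() is
 * what pulls in <linux/lockdep.h>.
 */
static void batadv_set_lockdep_class_one(struct net_device *dev,
					 struct netdev_queue *txq,
					 void *_unused)
{
	lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key);
}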
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 822af540b854..0ddd80130ea3 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/lockdep.h> #include #include #include -- cgit v1.2.3 From cf78bb0bbcef3fbe1abf118f14b81dad36eaa94e Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 20 May 2020 10:41:40 +0200 Subject: batman-adv: use rcu_replace_pointer() where appropriate In commit a63fc6b75cca ("rcu: Upgrade rcu_swap_protected() to rcu_replace_pointer()") a new helper macro named rcu_replace_pointer() was introduced to simplify code that needs to switch an rcu pointer to a new value while extracting the old one. Use rcu_replace_pointer() where appropriate to make code slimmer. Signed-off-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 4 ++-- net/batman-adv/hard-interface.c | 4 ++-- net/batman-adv/routing.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e22e49289677..a18dcc686dc3 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -146,8 +146,8 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, if (new_gw_node) kref_get(&new_gw_node->refcount); - curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); - rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); + curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, + true); if (curr_gw_node) batadv_gw_node_put(curr_gw_node); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c7e98a40dd33..3a256af92784 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -473,8 +473,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, if (new_hard_iface) kref_get(&new_hard_iface->refcount); - curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); - rcu_assign_pointer(bat_priv->primary_if, new_hard_iface); + curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, + new_hard_iface, 1); if (!new_hard_iface) goto out; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 3632bd976c56..d343382e9664 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -71,13 +71,13 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, * the code needs to ensure the curr_router variable contains a pointer * to the replaced best neighbor. */ - curr_router = rcu_dereference_protected(orig_ifinfo->router, true); /* increase refcount of new best neighbor */ if (neigh_node) kref_get(&neigh_node->refcount); - rcu_assign_pointer(orig_ifinfo->router, neigh_node); + curr_router = rcu_replace_pointer(orig_ifinfo->router, neigh_node, + true); spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(orig_ifinfo); -- cgit v1.2.3 From 38428d68719c454d269cb03b776d8a4b0ad66111 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:13 -0700 Subject: nexthop: support for fdb ecmp nexthops This patch introduces ecmp nexthops and nexthop groups for mac fdb entries. In subsequent patches this is used by the vxlan driver fdb entries.
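As a rough illustration of how a later consumer (for example the vxlan fdb support mentioned above) might resolve one ECMP leg of an fdb nexthop group, consider this sketch; mydrv_resolve_remote() is an invented name and not part of this patch:

#include <linux/in6.h>
#include <net/nexthop.h>

/* Pick one fdb nexthop out of the group via the kernel's group hash
 * and copy out its gateway address; fdb nexthops carry no device.
 */
static int mydrv_resolve_remote(struct nexthop *nh, int hash,
				__be32 *remote4, struct in6_addr *remote6)
{
	struct fib_nh_common *nhc;

	nhc = nexthop_path_fdb_result(nh, hash);
	if (!nhc)
		return -ENOENT;

	if (nhc->nhc_gw_family == AF_INET)
		*remote4 = nhc->nhc_gw.ipv4;
	else
		*remote6 = nhc->nhc_gw.ipv6;
	return 0;
}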
The use case is E-VPN multihoming [1,2,3] which requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (this is analogous to a multi-homed LAG but over vxlan). Changes include new nexthop flag NHA_FDB for nexthops referenced by fdb entries. These nexthops only have an ip address (no device or encap). This patch includes appropriate checks to avoid routes referencing such nexthops. example: $ip nexthop add id 12 via 172.16.1.2 fdb $ip nexthop add id 13 via 172.16.1.3 fdb $ip nexthop add id 102 group 12/13 fdb $bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 102 self [1] E-VPN https://tools.ietf.org/html/rfc7432 [2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365 [3] LPC talk with mention of nexthop groups for L2 ecmp http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf v4 - fixed uninitialized variable reported by kernel test robot Reported-by: kernel test robot Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/nexthop.h | 32 +++++++++++ include/uapi/linux/nexthop.h | 3 + net/ipv4/nexthop.c | 132 +++++++++++++++++++++++++++++++++++-------- net/ipv6/route.c | 5 ++ 5 files changed, 148 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index fdaf975e3331..3f615a29766e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -65,6 +65,7 @@ struct fib6_config { struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; + bool fc_is_fdb; }; struct fib6_node { diff --git a/include/net/nexthop.h b/include/net/nexthop.h index c440ccc861fc..d929c98931ad 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -26,6 +26,7 @@ struct nh_config { u8 nh_family; u8 nh_protocol; u8 nh_blackhole; + u8 nh_fdb; u32 nh_flags; int nh_ifindex; @@ -52,6 +53,7 @@ struct nh_info { u8 family; bool reject_nh; + bool fdb_nh; union { struct fib_nh_common fib_nhc; @@ -80,6 +82,7 @@ struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ + struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; @@ -88,6 +91,7 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool is_fdb_nh; refcount_t refcnt; struct rcu_head rcu; @@ -304,4 +308,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); + +static inline int nexthop_get_family(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return nhi->family; +} + +static inline +struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return &nhi->fib_nhc; +} + +static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, + int hash) +{ + struct nh_info *nhi; + struct nexthop *nhp; + + nhp = nexthop_select_path(nh, hash); + if (unlikely(!nhp)) + return NULL; + nhi = rcu_dereference(nhp->nh_info); + return &nhi->fib_nhc; +} #endif diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h index 7b61867e9848..2d4a1e784cf0 100644 --- a/include/uapi/linux/nexthop.h +++ b/include/uapi/linux/nexthop.h @@ -49,6 +49,9 @@ enum { NHA_GROUPS, /* flag; only return
nexthop groups in dump */ NHA_MASTER, /* u32; only return nexthops with given master dev */ + NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ + /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + __NHA_MAX, }; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 3957364d556c..bf91edc04631 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -33,6 +33,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_FDB] = { .type = NLA_FLAG }, }; static unsigned int nh_dev_hashfn(unsigned int val) @@ -107,6 +108,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); + INIT_LIST_HEAD(&nh->fdb_list); } return nh; } @@ -227,6 +229,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -241,7 +246,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else { + } else if (!nh->is_fdb_nh) { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -387,12 +392,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, return true; } +static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); + return -EINVAL; + } + + nhi = rtnl_dereference(nh->nh_info); + if (*nh_family == AF_UNSPEC) { + *nh_family = nhi->family; + } else if (*nh_family != nhi->family) { + NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); + return -EINVAL; + } + + return 0; +} + static int nh_check_attr_group(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); + u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; + u8 nhg_fdb = 0; if (len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -421,6 +449,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } } + if (tb[NHA_FDB]) + nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; @@ -432,11 +462,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } if (!valid_group_nh(nh, len, extack)) return -EINVAL; + + if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) + return -EINVAL; + + if (!nhg_fdb && nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); + return -EINVAL; + } } for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - + if (tb[NHA_FDB]) + continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -495,6 +534,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; + if (nhge->nh->is_fdb_nh) + return nhge->nh; + /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ @@ -564,6 +606,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, { struct nh_info *nhi; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route 
cannot point to a fdb nexthop"); + return -EINVAL; + } + /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source @@ -640,6 +687,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope, { int err = 0; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (nh->is_group) { struct nh_group *nhg; @@ -1125,6 +1178,9 @@ static struct nexthop *nexthop_create_group(struct net *net, nh_group_rebalance(nhg); } + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + rcu_assign_pointer(nh->nh_grp, nhg); return nh; @@ -1152,7 +1208,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; - u32 tb_id = l3mdev_fib_table(cfg->dev); + u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); @@ -1161,6 +1217,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } + if (nh->is_fdb_nh) + goto out; + /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { @@ -1186,6 +1245,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_flags = cfg->nh_flags, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, + .fc_is_fdb = cfg->nh_fdb, }; int err; @@ -1227,6 +1287,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; @@ -1248,7 +1311,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - nexthop_devhash_add(net, nhi); + if (!nh->is_fdb_nh) + nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); @@ -1352,6 +1416,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_FDB]) { + if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); + goto out; + } + if (nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); + goto out; + } + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); + } + if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); @@ -1375,8 +1452,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || - tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { - NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } @@ -1385,26 +1462,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!tb[NHA_OIF]) { - NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + if (!cfg->nh_fdb && !tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = 
__dev_get_by_index(net, cfg->nh_ifindex); + if (!cfg->nh_fdb && tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } } err = -EINVAL; @@ -1633,7 +1712,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, int *master_idx, bool *group_filter, - struct netlink_callback *cb) + bool *fdb_filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NHA_MAX + 1]; @@ -1670,6 +1749,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, case NHA_GROUPS: *group_filter = true; break; + case NHA_FDB: + *fdb_filter = true; + break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; @@ -1688,17 +1770,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { + bool group_filter = false, fdb_filter = false; struct nhmsg *nhm = nlmsg_data(cb->nlh); int dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; - bool group_filter = false; struct rb_node *node; int idx = 0, s_idx; int err; err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, cb); + &group_filter, &fdb_filter, cb); if (err < 0) return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a52ec1b86432..82cbb46a2a4f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif + if (cfg->fc_is_fdb) { + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + return 0; + } err = -ENODEV; if (cfg->fc_ifindex) { -- cgit v1.2.3 From 1274e1cc42264d4e629841e4f182795cb0becfd2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:14 -0700 Subject: vxlan: ecmp support for mac fdb entries Today's vxlan mac fdb entries can point to multiple remote ips (rdsts) with the sole purpose of replicating broadcast-multicast and unknown unicast packets to those remote ips. E-VPN multihoming [1,2,3] requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (E-VPN multihoming is analogous to multi-homed LAG implementations, but with the inter-switch peerlink replaced with a vxlan tunnel). In other words, it needs support for mac ecmp. Furthermore, for faster convergence, E-VPN multihoming needs the ability to update fdb ecmp nexthops independently of the fdb entries. The new route nexthop API is perfect for this use case.
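As an illustration (not part of the patch itself), the heart of mac ecmp is resolving an fdb nexthop group to one remote gateway per flow; with the helpers added by the previous commit this takes only a few lines. The patch carries the same idea as vxlan_fdb_nh_path_select() in the include/net/vxlan.h hunk below; in this sketch, example_fdb_resolve_remote() is a hypothetical name:

static bool example_fdb_resolve_remote(struct nexthop *nh, struct sk_buff *skb,
				       union vxlan_addr *remote_ip)
{
	struct fib_nh_common *nhc;

	/* pick one member of the ecmp group, keyed by the packet's flow hash */
	nhc = nexthop_path_fdb_result(nh, skb_get_hash(skb));
	if (!nhc)
		return false;

	switch (nhc->nhc_gw_family) {
	case AF_INET:
		remote_ip->sin.sin_addr.s_addr = nhc->nhc_gw.ipv4;
		remote_ip->sa.sa_family = AF_INET;
		break;
	case AF_INET6:
		remote_ip->sin6.sin6_addr = nhc->nhc_gw.ipv6;
		remote_ip->sa.sa_family = AF_INET6;
		break;
	}

	return true;
}

Because the selection is keyed by the flow hash, packets of one flow always go to the same vtep, preserving per-flow ordering while different flows spread across the ethernet segment.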
This patch extends the vxlan fdb code to take a nexthop id pointing to an ecmp nexthop group. Changes include: - New NDA_NH_ID attribute for fdbs - Use the newly added fdb nexthop groups - makes vxlan rdsts and nexthop handling code mutually exclusive - since this is a new use-case and the requirement is for ecmp nexthop groups, the fdb add and update path checks that the nexthop is really an ecmp nexthop group. This check can be relaxed in the future, if we want to introduce replication fdb nexthop groups and allow their use in lieu of current rdst lists. - fdb update requests with nexthop ids are only allowed for existing fdbs that already have nexthop ids - learning will not override an existing fdb entry that has a nexthop group - I have wrapped the switchdev offload code around the presence of an rdst [1] E-VPN RFC https://tools.ietf.org/html/rfc7432 [2] E-VPN with vxlan https://tools.ietf.org/html/rfc8365 [3] http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf Includes a null check fix in vxlan_xmit from Nikolay v2 - Fixed build issue: Reported-by: kbuild test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 306 +++++++++++++++++++++++++++++++++-------- include/net/vxlan.h | 25 ++++ include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 2 + 4 files changed, 275 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5b415fed11e..754e00240eea 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -26,6 +26,7 @@ #include #include #include +#include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include @@ -78,6 +79,8 @@ struct vxlan_fdb { u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ + struct list_head nh_list; + struct nexthop __rcu *nh; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 @@ -174,11 +177,15 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } @@ -251,9 +258,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, { unsigned long now = jiffies; struct nda_cacheinfo ci; + bool send_ip, send_eth; struct nlmsghdr *nlh; + struct nexthop *nh; struct ndmsg *ndm; - bool send_ip, send_eth; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -264,16 +272,21 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, send_eth = send_ip = true; + nh = rcu_dereference_rtnl(fdb->nh); if (type == RTM_GETNEIGH) { - send_ip = !vxlan_addr_any(&rdst->remote_ip); + if (rdst) { + send_ip = !vxlan_addr_any(&rdst->remote_ip); + ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; + } else if (nh) { + ndm->ndm_family = nexthop_get_family(nh); + } send_eth = !is_zero_ether_addr(fdb->eth_addr); - ndm->ndm_family = send_ip ? 
rdst->remote_ip.sa.sa_family : AF_INET; } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; - if (rdst->offloaded) + if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; @@ -284,23 +297,30 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; + if (nh) { + if (nla_put_u32(skb, NDA_NH_ID, nh->id)) + goto nla_put_failure; + } else if (rdst) { + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, + &rdst->remote_ip)) + goto nla_put_failure; + + if (rdst->remote_port && + rdst->remote_port != vxlan->cfg.dst_port && + nla_put_be16(skb, NDA_PORT, rdst->remote_port)) + goto nla_put_failure; + if (rdst->remote_vni != vxlan->default_dst.remote_vni && + nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) + goto nla_put_failure; + if (rdst->remote_ifindex && + nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) + goto nla_put_failure; + } - if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) - goto nla_put_failure; - - if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && - nla_put_be16(skb, NDA_PORT, rdst->remote_port)) - goto nla_put_failure; - if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) - goto nla_put_failure; if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; - if (rdst->remote_ifindex && - nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) - goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; @@ -401,7 +421,7 @@ static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, { int err; - if (swdev_notify) { + if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, @@ -805,6 +825,8 @@ static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state, f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; + f->nh = NULL; + INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -819,11 +841,78 @@ static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, vxlan_fdb_head(vxlan, mac, src_vni)); } +static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, + u32 nhid, struct netlink_ext_ack *extack) +{ + struct nexthop *old_nh = rtnl_dereference(fdb->nh); + struct nh_group *nhg; + struct nexthop *nh; + int err = -EINVAL; + + if (old_nh && old_nh->id == nhid) + return 0; + + nh = nexthop_find_by_id(vxlan->net, nhid); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + nh = NULL; + goto err_inval; + } + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); + goto err_inval; + } + + if (!nh->is_group || !nh->nh_grp->mpath) { + NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); + goto err_inval; + } + + /* check nexthop group family */ + nhg = rtnl_dereference(nh->nh_grp); + switch (vxlan->default_dst.remote_ip.sa.sa_family) { + case AF_INET: + if (!nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + break; + case AF_INET6: + if (nhg->has_v4) 
{ + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + } + } + + if (old_nh) { + list_del_rcu(&fdb->nh_list); + nexthop_put(old_nh); + } + rcu_assign_pointer(fdb->nh, nh); + list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); + return 1; + +err_inval: + if (nh) + nexthop_put(nh); + return err; +} + static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb **fdb) + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -838,20 +927,33 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (!f) return -ENOMEM; - rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); - if (rc < 0) { - kfree(f); - return rc; - } + if (nhid) + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + else + rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); + if (rc < 0) + goto errout; *fdb = f; return 0; + +errout: + kfree(f); + return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; + struct nexthop *nh; + + nh = rcu_dereference_raw(f->nh); + if (nh) { + rcu_assign_pointer(f->nh, NULL); + list_del_rcu(&f->nh_list); + nexthop_put(nh); + } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); @@ -875,10 +977,15 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; - if (do_notify) - list_for_each_entry(rd, &f->remotes, list) - vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + if (do_notify) { + if (rcu_access_pointer(f->nh)) + vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); + else + list_for_each_entry(rd, &f->remotes, list) + vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + swdev_notify, NULL); + } hlist_del_rcu(&f->hlist); call_rcu(&f->rcu, vxlan_fdb_free); @@ -897,7 +1004,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb *f, + struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -908,6 +1015,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, int rc = 0; int err; + if (nhid && !rcu_access_pointer(f->nh)) { + NL_SET_ERR_MSG(extack, + "Cannot replace an existing non nexthop fdb with a nexthop"); + return -EOPNOTSUPP; + } + + if (nhid && (flags & NLM_F_APPEND)) { + NL_SET_ERR_MSG(extack, + "Cannot append to a nexthop fdb"); + return -EOPNOTSUPP; + } + /* Do not allow an externally learned entry to take over an entry added * by the user. 
*/ @@ -929,10 +1048,17 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { - rc = vxlan_fdb_replace(f, ip, port, vni, - ifindex, &oldrd); + if (nhid) { + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + if (rc < 0) + return rc; + } else { + rc = vxlan_fdb_replace(f, ip, port, vni, + ifindex, &oldrd); + } notify |= rc; } else { + NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } @@ -962,6 +1088,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, return 0; err_notify: + if (nhid) + return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { @@ -975,7 +1103,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -990,7 +1118,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, - vni, ifindex, fdb_flags, &f); + vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; @@ -1012,7 +1140,7 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -1028,14 +1156,15 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, - swdev_notify, extack); + nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, - ndm_flags, swdev_notify, extack); + ndm_flags, nhid, swdev_notify, + extack); } } @@ -1049,7 +1178,7 @@ static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, - __be32 *vni, u32 *ifindex) + __be32 *vni, u32 *ifindex, u32 *nhid) { struct net *net = dev_net(vxlan->dev); int err; @@ -1109,6 +1238,11 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } + if (tb[NDA_NH_ID]) + *nhid = nla_get_u32(tb[NDA_NH_ID]); + else + *nhid = 0; + return 0; } @@ -1123,7 +1257,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be16 port; __be32 src_vni, vni; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; int err; @@ -1133,10 +1267,11 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (tb[NDA_DST] == NULL) + if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1148,7 +1283,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, - true, extack); + nhid, true, extack); 
spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; @@ -1159,8 +1294,8 @@ static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { - struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; + struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); @@ -1195,12 +1330,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; - __be16 port; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; + __be16 port; int err; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1228,6 +1364,17 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; + if (rcu_access_pointer(f->nh)) { + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, NULL); + if (err < 0) + goto out; + continue; + } + list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; @@ -1311,6 +1458,10 @@ static bool vxlan_snoop(struct net_device *dev, if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; + /* Don't override an fdb with nexthop with a learnt entry */ + if (rcu_access_pointer(f->nh)) + return true; + if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", @@ -1333,7 +1484,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, - ifindex, NTF_SELF, true, NULL); + ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } @@ -2616,6 +2767,38 @@ tx_error: kfree_skb(skb); } +static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, + struct vxlan_fdb *f, __be32 vni, bool did_rsc) +{ + struct vxlan_rdst nh_rdst; + struct nexthop *nh; + bool do_xmit; + u32 hash; + + memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); + hash = skb_get_hash(skb); + + rcu_read_lock(); + nh = rcu_dereference(f->nh); + if (!nh) { + rcu_read_unlock(); + goto drop; + } + do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); + rcu_read_unlock(); + + if (likely(do_xmit)) + vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + else + goto drop; + + return; + +drop: + dev->stats.tx_dropped++; + dev_kfree_skb(skb); +} + /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. @@ -2692,22 +2875,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } } - list_for_each_entry_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; + if (rcu_access_pointer(f->nh)) { + vxlan_xmit_nh(skb, dev, f, + (vni ? 
: vxlan->default_dst.remote_vni), did_rsc); + } else { + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - if (!fdst) { - fdst = rdst; - continue; + if (!fdst) { + fdst = rdst; + continue; + } + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); + if (fdst) + vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); + else + kfree_skb(skb); } - if (fdst) - vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); - else - kfree_skb(skb); return NETDEV_TX_OK; } @@ -3615,7 +3803,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, &f); + NTF_SELF, 0, &f, extack); if (err) return err; } @@ -4013,7 +4201,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, - NTF_SELF, true, extack); + NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, @@ -4335,7 +4523,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, - false, extack); + 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 373aadcfea21..3a41627cbdfe 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define IANA_VXLAN_UDP_PORT 4789 @@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype, #undef VXLAN_FLAG } +static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, + int hash, + struct vxlan_rdst *rdst) +{ + struct fib_nh_common *nhc; + + nhc = nexthop_path_fdb_result(nh, hash); + if (unlikely(!nhc)) + return false; + + switch (nhc->nhc_gw_family) { + case AF_INET: + rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; + rdst->remote_ip.sa.sa_family = AF_INET; + break; + case AF_INET6: + rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; + rdst->remote_ip.sa.sa_family = AF_INET6; + break; + } + + return true; +} + #endif diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index cd144e3099a3..eefcda8ca44e 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -29,6 +29,7 @@ enum { NDA_LINK_NETNSID, NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ + NDA_NH_ID, __NDA_MAX }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b607ea602774..37e4dba62460 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 8590ceedb70181ad9de5a3dc2cfe50ca33a9576a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:15 -0700 Subject: nexthop: add support 
for notifiers This patch adds nexthop add/del notifiers. To be used by vxlan driver in a later patch. Could possibly be used by switchdev drivers in the future. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/netns/nexthop.h | 1 + include/net/nexthop.h | 12 ++++++++++++ net/ipv4/nexthop.c | 27 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h index c712ee5eebd9..1937476c94a0 100644 --- a/include/net/netns/nexthop.h +++ b/include/net/netns/nexthop.h @@ -14,5 +14,6 @@ struct netns_nexthop { unsigned int seq; /* protected by rtnl_mutex */ u32 last_id_allocated; + struct atomic_notifier_head notifier_chain; }; #endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d929c98931ad..4c951680f6f9 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -10,6 +10,7 @@ #define __LINUX_NEXTHOP_H #include +#include #include #include #include @@ -102,6 +103,17 @@ struct nexthop { }; }; +enum nexthop_event_type { + NEXTHOP_EVENT_ADD, + NEXTHOP_EVENT_DEL +}; + +int call_nexthop_notifier(struct notifier_block *nb, struct net *net, + enum nexthop_event_type event_type, + struct nexthop *nh); +int register_nexthop_notifier(struct net *net, struct notifier_block *nb); +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); + /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index bf91edc04631..c337e73e02dd 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -36,6 +36,17 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_FDB] = { .type = NLA_FLAG }, }; +static int call_nexthop_notifiers(struct net *net, + enum fib_event_type event_type, + struct nexthop *nh) +{ + int err; + + err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); + return notifier_to_errno(err); +} + static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; @@ -826,6 +837,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -1865,6 +1878,19 @@ static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; +int register_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); +} +EXPORT_SYMBOL(register_nexthop_notifier); + +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); +} +EXPORT_SYMBOL(unregister_nexthop_notifier); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -1881,6 +1907,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; + ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } -- cgit v1.2.3 From e7bb18e6c8b7ed35746af8c7c708447a66ee385c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:35 +0300 Subject: ip6_tunnel: simplify transmit path Merge ip{4,6}ip6_tnl_xmit functions into one universal 
ipxip6_tnl_xmit in preparation for adding MPLS support. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 182 ++++++++++++++++++++++---------------------------- 1 file changed, 79 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4703b09808d0..dae6f7146b49 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1253,22 +1253,22 @@ tx_err_dst_release: EXPORT_SYMBOL(ip6_tnl_xmit); static inline int -ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, + u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; + __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; - iph = ip_hdr(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = READ_ONCE(t->parms.proto); - if (tproto != IPPROTO_IPIP && tproto != 0) + if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { @@ -1281,129 +1281,101 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + break; + default: + orig_dsfield = dsfield; + break; + } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; + if (protocol == IPPROTO_IPV6) { + offset = ip6_tnl_parse_tlv_enc_lim(skb, + skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have + * reallocated skb->head + */ + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - } - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); - - if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) - return -1; - - skb_set_inner_ipproto(skb, IPPROTO_IPIP); - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPIP); - if (err != 0) { - /* XXX: send ICMP error even if DF is not set. 
*/ - if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - return -1; - } - - return 0; -} - -static inline int -ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h; - int encap_limit = -1; - __u16 offset; - struct flowi6 fl6; - __u8 dsfield; - __u32 mtu; - u8 tproto; - int err; - - ipv6h = ipv6_hdr(skb); - tproto = READ_ONCE(t->parms.proto); - if ((tproto != IPPROTO_IPV6 && tproto != 0) || - ip6_tnl_addr_conflict(t, ipv6h)) - return -1; - - if (t->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -1; - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; - fl6.saddr = key->u.ipv6.src; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - dsfield = key->tos; - } else { - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - - tel = (void *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { - encap_limit = t->parms.encap_limit; } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_proto = protocol; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + break; + default: + break; + } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; - skb_set_inner_ipproto(skb, IPPROTO_IPV6); + skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPV6); + protocol); if (err != 0) { + /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + switch (protocol) { + case IPPROTO_IPIP: + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + break; + case IPPROTO_IPV6: + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + break; + default: + break; + } return -1; } @@ -1415,6 +1387,7 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; + u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) @@ -1422,15 +1395,18 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IP): - ret = ip4ip6_tnl_xmit(skb, dev); + ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): - ret = ip6ip6_tnl_xmit(skb, dev); + if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) + goto tx_err; + ipproto = IPPROTO_IPV6; break; default: goto tx_err; } + ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; -- cgit v1.2.3 From 6c11fbf97e69d2164406fc634758f90d34501ece Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:36 +0300 Subject: ip6_tunnel: add MPLS transmit support Add ETH_P_MPLS_UC as supported protocol. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index dae6f7146b49..6b94c870693c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1232,6 +1232,8 @@ route_lookup: ipv6_push_frag_opts(skb, &opt.ops, &proto); } + skb_set_inner_ipproto(skb, proto); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); @@ -1348,6 +1350,7 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: + orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } @@ -1358,8 +1361,6 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; - skb_set_inner_ipproto(skb, protocol); - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { @@ -1402,6 +1403,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; ipproto = IPPROTO_IPV6; break; + case htons(ETH_P_MPLS_UC): + ipproto = IPPROTO_MPLS; + break; default: goto tx_err; } -- cgit v1.2.3 From f234efac2c6220f32cbc446e75dc1d27b04166c3 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:37 +0300 Subject: tunnel6: support for IPPROTO_MPLS This patch is just preparation for MPLS support in ip6_tunnel Signed-off-by: Vadim Fedorenko Signed-off-by: David S. 
Miller --- net/ipv6/tunnel6.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 21e7b95ddbfa..06c02ebe6b9b 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -21,8 +21,14 @@ static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); +static inline int xfrm6_tunnel_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; @@ -32,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) @@ -62,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { @@ -73,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) } } +err: mutex_unlock(&tunnel6_mutex); synchronize_net(); @@ -86,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister); handler != NULL; \ handler = rcu_dereference(handler->next)) \ +static int tunnelmpls6_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -146,6 +197,18 @@ static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return -ENOENT; } +static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + return 0; + + return -ENOENT; +} + static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, @@ -158,6 +221,12 @@ static const struct inet6_protocol tunnel46_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +static const struct inet6_protocol tunnelmpls6_protocol = { + .handler = tunnelmpls6_rcv, + .err_handler = tunnelmpls6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + static int __init 
tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { @@ -169,6 +238,13 @@ static int __init tunnel6_init(void) inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } + if (xfrm6_tunnel_mpls_supported() && + inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { + pr_err("%s: can't add protocol\n", __func__); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); + return -EAGAIN; + } return 0; } @@ -178,6 +254,9 @@ static void __exit tunnel6_fini(void) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); + if (xfrm6_tunnel_mpls_supported() && + inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) + pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); -- cgit v1.2.3 From f200e98d9716ce52464838dbaa2856e5ecc52194 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:38 +0300 Subject: ip6_tunnel: add generic MPLS receive support Add support for MPLS in receive side. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6b94c870693c..821d96c720b9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -89,6 +89,11 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *collect_md_tun; }; +static inline int ip6_tnl_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + static struct net_device_stats *ip6_get_stats(struct net_device *dev) { struct pcpu_sw_netstats tmp, sum = { 0 }; @@ -718,6 +723,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static int +mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + + err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + return err; +} + static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) @@ -740,6 +759,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, return IP6_ECN_decapsulate(ipv6h, skb); } +static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + /* ECN is not supported in AF_MPLS */ + return 0; +} + __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) @@ -901,6 +928,11 @@ static const struct tnl_ptk_info tpi_v4 = { .proto = htons(ETH_P_IP), }; +static const struct tnl_ptk_info tpi_mpls = { + /* no tunnel info required for mplsip6. 
*/ + .proto = htons(ETH_P_MPLS_UC), +}; + static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, @@ -958,6 +990,12 @@ static int ip6ip6_rcv(struct sk_buff *skb) ip6ip6_dscp_ecn_decapsulate); } +static int mplsip6_rcv(struct sk_buff *skb) +{ + return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, + mplsip6_dscp_ecn_decapsulate); +} + struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; @@ -2198,6 +2236,12 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; +static struct xfrm6_tunnel mplsip6_handler __read_mostly = { + .handler = mplsip6_rcv, + .err_handler = mplsip6_err, + .priority = 1, +}; + static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); @@ -2312,6 +2356,15 @@ static int __init ip6_tunnel_init(void) pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } + + if (ip6_tnl_mpls_supported()) { + err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); + if (err < 0) { + pr_err("%s: can't register mplsip6\n", __func__); + goto out_mplsip6; + } + } + err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -2319,6 +2372,9 @@ static int __init ip6_tunnel_init(void) return 0; rtnl_link_failed: + if (ip6_tnl_mpls_supported()) + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); +out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); @@ -2341,6 +2397,9 @@ static void __exit ip6_tunnel_cleanup(void) if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); + if (ip6_tnl_mpls_supported() && + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) + pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } -- cgit v1.2.3 From 1515aa70c04151676a7dfefecfcf1d36ff52bc67 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 20 May 2020 18:21:39 +0300 Subject: mpls: Add support for IPv6 tunnels Add support for IPv6 tunnel devices in AF_MPLS. Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a42e4ed5ab0e..fd30ea61336e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1593,7 +1593,8 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, dev->type == ARPHRD_IPGRE || dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || - dev->type == ARPHRD_TUNNEL) { + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_TUNNEL6) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); -- cgit v1.2.3 From 060b6381efe58478e1d7dfff7a1e76a73a6377db Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 20 May 2020 19:18:10 +0100 Subject: net: flow_offload: simplify hw stats check handling Make FLOW_ACTION_HW_STATS_DONT_CARE be all bits, rather than none, so that drivers and __flow_action_hw_stats_check can use simple bitwise checks. Pre-fill all actions with DONT_CARE in flow_rule_alloc(), rather than relying on implicit semantics of zero from kzalloc, so that callers which don't configure action stats themselves (i.e. netfilter) get the correct behaviour by default. Only the kernel's internal API semantics change; the TC uAPI is unaffected. 
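As an illustration (not part of the patch), since DONT_CARE is now all bits set it satisfies any bitwise test, so a driver's capability check needs no special case for it. A minimal sketch of the resulting pattern, with example_check_hw_stats() as a hypothetical name (the mlxsw hunk below is the in-tree equivalent):

static int example_check_hw_stats(const struct flow_action_entry *act,
				  struct netlink_ext_ack *extack)
{
	if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED)
		return 0;	/* nothing to count; DONT_CARE also lands here */

	if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE)
		return 1;	/* caller should insert an immediate counter */

	NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type");
	return -EOPNOTSUPP;
}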
v4: move DONT_CARE setting to flow_rule_alloc() for robustness and simplicity. v3: set DONT_CARE in nft and ct offload. v2: rebased on net-next, removed RFC tags. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 8 ++++---- include/net/flow_offload.h | 11 +++++++---- net/core/flow_offload.c | 6 ++++++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index b286fe158820..51e1b3930c56 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -30,14 +30,14 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; act = flow_action_first_entry_get(flow_action); - if (act->hw_stats == FLOW_ACTION_HW_STATS_ANY || - act->hw_stats == FLOW_ACTION_HW_STATS_IMMEDIATE) { + if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED) { + /* Nothing to do */ + } else if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE) { /* Count action is inserted first */ err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack); if (err) return err; - } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED && - act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) { + } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type"); return -EOPNOTSUPP; } diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 4001ffb04f0d..95d633785ef9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -168,10 +168,11 @@ enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, + + FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { - FLOW_ACTION_HW_STATS_DONT_CARE = 0, FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), @@ -179,6 +180,7 @@ enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), + FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); @@ -340,11 +342,12 @@ __flow_action_hw_stats_check(const struct flow_action *action, return false; action_entry = flow_action_first_entry_get(action); - if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE) - return true; + + /* Zero is not a legal value for hw_stats, catch anyone passing it */ + WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && - action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) { + ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e951b743bed3..e64941c526b1 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -8,6 +8,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; + int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); @@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) return NULL; rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. 
+ */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } -- cgit v1.2.3 From 7aa38018be1fa4cff3e631f26bc821086ba90d29 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 21 May 2020 23:19:05 +0000 Subject: bridge: mrp: Add br_mrp_unique_ifindex function It is not allowed to have the same net bridge port be part of multiple MRP rings. Therefore add a check whether the port is already used in a different MRP. In that case return failure. Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index d7bc09de4c13..854e31bf0151 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -37,6 +37,26 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) return res; } +static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) +{ + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + struct net_bridge_port *p; + + p = rtnl_dereference(mrp->p_port); + if (p && p->dev->ifindex == ifindex) + return false; + + p = rtnl_dereference(mrp->s_port); + if (p && p->dev->ifindex == ifindex) + return false; + } + + return true; +} + static struct br_mrp *br_mrp_find_port(struct net_bridge *br, struct net_bridge_port *p) { @@ -255,6 +275,11 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) !br_mrp_get_port(br, instance->s_ifindex)) return -EINVAL; + /* It is not possible to have the same port part of multiple rings */ + if (!br_mrp_unique_ifindex(br, instance->p_ifindex) || + !br_mrp_unique_ifindex(br, instance->s_ifindex)) + return -EINVAL; + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); if (!mrp) return -ENOMEM; -- cgit v1.2.3 From 4fb13499d3a0cc74cf9820c052481f0ccda2bb23 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 21 May 2020 23:19:07 +0000 Subject: bridge: mrp: Restore port state when deleting MRP instance When an MRP instance is deleted, restore the port state according to the bridge state. If the bridge is up then the ports will be in the forwarding state, otherwise they will be in the disabled state. Fixes: 9a9f26e8f7ea ("bridge: mrp: Connect MRP API with the switchdev API") Acked-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 854e31bf0151..528d767eb026 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -223,6 +223,7 @@ out: static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) { struct net_bridge_port *p; + u8 state; /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); @@ -234,20 +235,24 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) p = rtnl_dereference(mrp->p_port); if (p) { spin_lock_bh(&br->lock); - p->state = BR_STATE_FORWARDING; + state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); - br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->p_port, NULL); } p = rtnl_dereference(mrp->s_port); if (p) { spin_lock_bh(&br->lock); - p->state = BR_STATE_FORWARDING; + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; p->flags &= ~BR_MRP_AWARE; spin_unlock_bh(&br->lock); - br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + br_mrp_port_switchdev_set_state(p, state); rcu_assign_pointer(mrp->s_port, NULL); } -- cgit v1.2.3 From 07a7f30819475a6f058df6bba5150c50e7942cfb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 May 2020 13:05:26 -0700 Subject: net: psample: fix build error when CONFIG_INET is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix psample build error when CONFIG_INET is not set/enabled by bracketing the tunnel code in #ifdef CONFIG_INET / #endif. ../net/psample/psample.c: In function ‘__psample_ip_tun_to_nlattr’: ../net/psample/psample.c:216:25: error: implicit declaration of function ‘ip_tunnel_info_opts’; did you mean ‘ip_tunnel_info_opts_set’? [-Werror=implicit-function-declaration] Signed-off-by: Randy Dunlap Cc: Yotam Gigi Cc: Cong Wang Signed-off-by: David S. Miller --- net/psample/psample.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/psample/psample.c b/net/psample/psample.c index 34a74043840b..a042261a45c5 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -209,6 +209,7 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +#ifdef CONFIG_INET static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { @@ -352,12 +353,15 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) return sum; } +#endif void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { +#ifdef CONFIG_INET struct ip_tunnel_info *tun_info; +#endif struct sk_buff *nl_skb; int data_len; int meta_len; @@ -371,9 +375,11 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ +#ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); if (tun_info) meta_len += psample_tunnel_meta_len(tun_info); +#endif data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) @@ -429,11 +435,13 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } +#ifdef CONFIG_INET if (tun_info) { ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); if (unlikely(ret < 0)) goto error; } +#endif genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, -- cgit v1.2.3 From cb8a14b205699fee1053a406e1e3fce330b6bdc3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:08 +0200 Subject: net: move devres helpers into a separate source file There's currently only a single devres helper in net/ - devm variant of alloc_etherdev. Let's move it to net/devres.c with the intention of adding a second one: devm_register_netdev().
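As a hedged illustration of where this is heading (a sketch with invented mydrv_* names, not code from this series), a driver probe() would then be able to lean on devres for the whole netdev lifetime:

#include <linux/etherdevice.h>
#include <linux/platform_device.h>

struct mydrv_priv { int dummy; }; /* invented private state */

static int mydrv_probe(struct platform_device *pdev)
{
        struct net_device *ndev;

        /* the netdev is freed automatically when pdev->dev detaches */
        ndev = devm_alloc_etherdev(&pdev->dev, sizeof(struct mydrv_priv));
        if (!ndev)
                return -ENOMEM;

        SET_NETDEV_DEV(ndev, &pdev->dev);
        /* ... set up netdev_ops, MAC address, etc. ... */

        /* devres releases run in reverse order, so the automatic
         * unregister happens before the automatic free above
         */
        return devm_register_netdev(&pdev->dev, ndev);
}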
This new routine will need to know the address of the release function of devm_alloc_etherdev() so that it can verify (using devres_find()) that the struct net_device that's being passed to it is also resource managed. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- net/Makefile | 2 +- net/devres.c | 36 ++++++++++++++++++++++++++++++++++++ net/ethernet/eth.c | 28 ---------------------------- 3 files changed, 37 insertions(+), 29 deletions(-) create mode 100644 net/devres.c (limited to 'net') diff --git a/net/Makefile b/net/Makefile index 07ea48160874..5744bf1997fd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -6,7 +6,7 @@ # Rewritten to use lists instead of if-statements. # -obj-$(CONFIG_NET) := socket.o core/ +obj-$(CONFIG_NET) := devres.o socket.o core/ tmp-$(CONFIG_COMPAT) := compat.o obj-$(CONFIG_NET) += $(tmp-y) diff --git a/net/devres.c b/net/devres.c new file mode 100644 index 000000000000..c1465d9f9019 --- /dev/null +++ b/net/devres.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains all networking devres helpers. + */ + +#include <linux/device.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> + +static void devm_free_netdev(struct device *dev, void *res) +{ + free_netdev(*(struct net_device **)res); +} + +struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, + unsigned int txqs, unsigned int rxqs) +{ + struct net_device **dr; + struct net_device *netdev; + + dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); + if (!dr) + return NULL; + + netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); + if (!netdev) { + devres_free(dr); + return NULL; + } + + *dr = netdev; + devres_add(dev, dr); + + return netdev; +} +EXPORT_SYMBOL(devm_alloc_etherdev_mqs); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index c8b903302ff2..dac65180c4ef 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -400,34 +400,6 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, } EXPORT_SYMBOL(alloc_etherdev_mqs); -static void devm_free_netdev(struct device *dev, void *res) -{ - free_netdev(*(struct net_device **)res); -} - -struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, - unsigned int txqs, unsigned int rxqs) -{ - struct net_device **dr; - struct net_device *netdev; - - dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); - if (!dr) - return NULL; - - netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); - if (!netdev) { - devres_free(dr); - return NULL; - } - - *dr = netdev; - devres_add(dev, dr); - - return netdev; -} -EXPORT_SYMBOL(devm_alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); -- cgit v1.2.3 From f75063abc39441585a13fcc5b9ef3af993e9ac40 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:09 +0200 Subject: net: devres: define a separate devres structure for devm_alloc_etherdev() Not using a proxy structure to store struct net_device doesn't save anything in terms of compiled code size or memory usage but significantly decreases the readability of the code with all the pointer casting. Define struct net_device_devres and use it in devm_alloc_etherdev_mqs(). Signed-off-by: Bartosz Golaszewski Signed-off-by: David S.
Miller --- net/devres.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/devres.c b/net/devres.c index c1465d9f9019..b97b0c5a8216 100644 --- a/net/devres.c +++ b/net/devres.c @@ -7,30 +7,34 @@ #include #include -static void devm_free_netdev(struct device *dev, void *res) +struct net_device_devres { + struct net_device *ndev; +}; + +static void devm_free_netdev(struct device *dev, void *this) { - free_netdev(*(struct net_device **)res); + struct net_device_devres *res = this; + + free_netdev(res->ndev); } struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs) { - struct net_device **dr; - struct net_device *netdev; + struct net_device_devres *dr; dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); if (!dr) return NULL; - netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); - if (!netdev) { + dr->ndev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); + if (!dr->ndev) { devres_free(dr); return NULL; } - *dr = netdev; devres_add(dev, dr); - return netdev; + return dr->ndev; } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); -- cgit v1.2.3 From cd16627fc0468564fdd60f20ad52420b87195127 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:10 +0200 Subject: net: devres: provide devm_register_netdev() Provide devm_register_netdev() - a device resource managed variant of register_netdev(). This new helper will only work for net_device structs that are also already managed by devres. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- Documentation/driver-api/driver-model/devres.rst | 1 + include/linux/netdevice.h | 2 + net/devres.c | 55 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'net') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 50df28d20fa7..fc242ed4bde5 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -375,6 +375,7 @@ MUX NET devm_alloc_etherdev() devm_alloc_etherdev_mqs() + devm_register_netdev() PER-CPU MEM devm_alloc_percpu() diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a18f8fdf4260..1a96e9c4ec36 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4280,6 +4280,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); +int devm_register_netdev(struct device *dev, struct net_device *ndev); + /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); diff --git a/net/devres.c b/net/devres.c index b97b0c5a8216..57a6a88d11f6 100644 --- a/net/devres.c +++ b/net/devres.c @@ -38,3 +38,58 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, return dr->ndev; } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); + +static void devm_netdev_release(struct device *dev, void *this) +{ + struct net_device_devres *res = this; + + unregister_netdev(res->ndev); +} + +static int netdev_devres_match(struct device *dev, void *this, void *match_data) +{ + struct net_device_devres *res = this; + struct net_device *ndev = match_data; + + return ndev == res->ndev; +} + +/** + * devm_register_netdev - resource managed variant of register_netdev() + * @dev: managing device for this netdev - usually the 
parent device + * @ndev: device to register + * + * This is a devres variant of register_netdev() for which the unregister + * function will be call automatically when the managing device is + * detached. Note: the net_device used must also be resource managed by + * the same struct device. + */ +int devm_register_netdev(struct device *dev, struct net_device *ndev) +{ + struct net_device_devres *dr; + int ret; + + /* struct net_device must itself be managed. For now a managed netdev + * can only be allocated by devm_alloc_etherdev_mqs() so the check is + * straightforward. + */ + if (WARN_ON(!devres_find(dev, devm_free_netdev, + netdev_devres_match, ndev))) + return -EINVAL; + + dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + ret = register_netdev(ndev); + if (ret) { + devres_free(dr); + return ret; + } + + dr->ndev = ndev; + devres_add(ndev->dev.parent, dr); + + return 0; +} +EXPORT_SYMBOL(devm_register_netdev); -- cgit v1.2.3 From 316107119f473e764cf5e50437333c8b83bec0da Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 23 May 2020 17:40:25 +0200 Subject: ethtool: propagate get_coalesce return value get_coalesce returns 0 or ERRNO, but the return value isn't checked. The returned coalesce data may be invalid if an ERRNO is set, therefore better check and propagate the return value. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 31e0b4e88a9d..b5df90c981c2 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1510,11 +1510,14 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; - dev->ethtool_ops->get_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce); + if (ret) + return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; -- cgit v1.2.3 From 6a1015b0b4b1f3a0de9e40d2ba86877d13f50918 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 23 May 2020 20:46:48 +0300 Subject: ipv4: potential underflow in compat_ip_setsockopt() The value of "n" is capped at 0x1ffffff but it checked for negative values. I don't think this causes a problem but I'm not certain and it's harmless to prevent it. Fixes: 2e04172875c9 ("ipv4: do compat setsockopt for MCAST_MSFILTER directly") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a2469bc57cfe..f43d5f12aa86 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1347,8 +1347,8 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, { const int size0 = offsetof(struct compat_group_filter, gf_slist); struct compat_group_filter *gf32; + unsigned int n; void *p; - int n; if (optlen < size0) return -EINVAL; -- cgit v1.2.3 From 45af29ca761c275e350cca659856bc56f1035ef9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 May 2020 11:00:02 -0700 Subject: tcp: allow traceroute -Mtcp for unpriv users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unpriv users can use traceroute over plain UDP sockets, but not TCP ones. 
$ traceroute -Mtcp 8.8.8.8 You do not have enough privileges to use this traceroute method. $ traceroute -n -Mudp 8.8.8.8 traceroute to 8.8.8.8 (8.8.8.8), 30 hops max, 60 byte packets 1 192.168.86.1 3.631 ms 3.512 ms 3.405 ms 2 10.1.10.1 4.183 ms 4.125 ms 4.072 ms 3 96.120.88.125 20.621 ms 19.462 ms 20.553 ms 4 96.110.177.65 24.271 ms 25.351 ms 25.250 ms 5 69.139.199.197 44.492 ms 43.075 ms 44.346 ms 6 68.86.143.93 27.969 ms 25.184 ms 25.092 ms 7 96.112.146.18 25.323 ms 96.112.146.22 25.583 ms 96.112.146.26 24.502 ms 8 72.14.239.204 24.405 ms 74.125.37.224 16.326 ms 17.194 ms 9 209.85.251.9 18.154 ms 209.85.247.55 14.449 ms 209.85.251.9 26.296 ms^C We can easily support traceroute over TCP, by queueing an error message into socket error queue. Note that applications need to set IP_RECVERR/IPV6_RECVERR option to enable this feature, and that the error message is only queued while in SYN_SNT state. socket(AF_INET6, SOCK_STREAM, IPPROTO_IP) = 3 setsockopt(3, SOL_IPV6, IPV6_RECVERR, [1], 4) = 0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMP_OLD, [1], 4) = 0 setsockopt(3, SOL_IPV6, IPV6_UNICAST_HOPS, [5], 4) = 0 connect(3, {sa_family=AF_INET6, sin6_port=htons(8787), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "2002:a05:6608:297::", &sin6_addr), sin6_scope_id=0}, 28) = -1 EHOSTUNREACH (No route to host) recvmsg(3, {msg_name={sa_family=AF_INET6, sin6_port=htons(8787), sin6_flowinfo=htonl(0), inet_pton(AF_INET6, "2002:a05:6608:297::", &sin6_addr), sin6_scope_id=0}, msg_namelen=1024->28, msg_iov=[{iov_base="`\r\337\320\0004\6\1&\7\370\260\200\231\16\27\0\0\0\0\0\0\0\0 \2\n\5f\10\2\227"..., iov_len=1024}], msg_iovlen=1, msg_control=[{cmsg_len=32, cmsg_level=SOL_SOCKET, cmsg_type=SO_TIMESTAMP_OLD, cmsg_data={tv_sec=1590340680, tv_usec=272424}}, {cmsg_len=60, cmsg_level=SOL_IPV6, cmsg_type=IPV6_RECVERR}], msg_controllen=96, msg_flags=MSG_ERRQUEUE}, MSG_ERRQUEUE) = 144 Suggested-by: Maciej Żenczykowski Cc: Willem de Bruijn Reviewed-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6c05f1ceb538..900c6d154cbc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -573,6 +573,8 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 413b3425ac66..01a6f5111a77 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -463,6 +463,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (fastopen && !fastopen->sk) break; + ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ -- cgit v1.2.3 From 617504c67e01d30310558442777a4112ea6d587d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 25 May 2020 09:55:41 +0000 Subject: bridge: mrp: Fix out-of-bounds read in br_mrp_parse The issue was reported by syzbot. When the function br_mrp_parse was called with a valid net_bridge_port, the net_bridge was an invalid pointer. Therefore the check br->stp_enabled could pass/fail depending where it was pointing in memory. The fix consists of setting the net_bridge pointer if the port is a valid pointer. 
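Reduced to a standalone sketch (structs pared down to the relevant members, not the kernel definitions), the shape of the fix is:

struct net_bridge { int stp_enabled; };
struct net_bridge_port { struct net_bridge *br; };

/* In the per-port call path only "p" is trustworthy, so "br" must be
 * re-derived from the port before it is dereferenced.
 */
static int parse(struct net_bridge *br, struct net_bridge_port *p)
{
        if (p)
                br = p->br;

        return br->stp_enabled; /* br is now guaranteed valid */
}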
Reported-by: syzbot+9c6f0f1f8e32223df9a4@syzkaller.appspotmail.com Fixes: 6536993371fa ("bridge: mrp: Integrate MRP into the bridge") Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mrp_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 397e7f710772..4a08a99519b0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -27,6 +27,12 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; int err; + /* When this function is called for a port then the br pointer is + * invalid, therefor set the br to point correctly + */ + if (p) + br = p->br; + if (br->stp_enabled != BR_NO_STP) { NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled"); return -EINVAL; -- cgit v1.2.3 From 9ad346c90509ebd983f60da7d082f261ad329507 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 25 Nov 2019 10:46:50 +0100 Subject: batman-adv: Revert "disable ethtool link speed detection when auto negotiation off" The commit 8c46fcd78308 ("batman-adv: disable ethtool link speed detection when auto negotiation off") disabled the usage of ethtool's link_ksetting when auto negotation was enabled due to invalid values when used with tun/tap virtual net_devices. According to the patch, automatic measurements should be used for these kind of interfaces. But there are major flaws with this argumentation: * automatic measurements are not implemented * auto negotiation has nothing to do with the validity of the retrieved values The first point has to be fixed by a longer patch series. The "validity" part of the second point must be addressed in the same patch series by dropping the usage of ethtool's link_ksetting (thus always doing automatic measurements over ethernet). Drop the patch again to have more default values for various net_device types/configurations. The user can still overwrite them using the batadv_hardif's BATADV_ATTR_THROUGHPUT_OVERRIDE. Reported-by: Matthias Schiffer Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 353e49c40e7f..0bdefa35da98 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -127,20 +127,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); - - /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc - * tend to initialize the interface throughput with some value for the - * sake of having a throughput number to export via ethtool. This - * exported throughput leaves batman-adv to conclude the interface - * throughput is genuine (reflecting reality), thus no measurements - * are necessary. - * - * Based on the observation that those interface types also tend to set - * the link auto-negotiation to 'off', batman-adv shall check this - * setting to differentiate between genuine link throughput information - * and placeholders installed by virtual interfaces. 
- */ - if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) { + if (ret == 0) { /* link characteristics might change over time */ if (link_settings.base.duplex == DUPLEX_FULL) hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; -- cgit v1.2.3 From 239174945dac8cb9613db7755103d5fb6c32241d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2020 20:15:24 -0700 Subject: tcp: tcp_v4_err() icmp skb is named icmp_skb I missed the fact that tcp_v4_err() differs from tcp_v6_err(). After commit 4d1a2d9ec1c1 ("Rename skb to icmp_skb in tcp_v4_err()") the skb argument has been renamed to icmp_skb only in one function. I will in a future patch reconcile these functions to avoid this kind of confusion. Fixes: 45af29ca761c ("tcp: allow traceroute -Mtcp for unpriv users") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 900c6d154cbc..6789671f0f5a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -573,7 +573,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit v1.2.3 From d7626b5acff9227e2a65da636a53e09bdafdc0aa Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:34 +0700 Subject: tipc: introduce Gap ACK blocks for broadcast link As achieved through commit 9195948fbf34 ("tipc: improve TIPC throughput by Gap ACK blocks"), we apply the same mechanism for the broadcast link as well. The 'Gap ACK blocks' data field in a 'PROTOCOL/STATE_MSG' will consist of two parts built for both the broadcast and unicast types: 31 16 15 0 +-------------+-------------+-------------+-------------+ | bgack_cnt | ugack_cnt | len | +-------------+-------------+-------------+-------------+ - | gap | ack | | +-------------+-------------+-------------+-------------+ > bc gacks : : : | +-------------+-------------+-------------+-------------+ - | gap | ack | | +-------------+-------------+-------------+-------------+ > uc gacks : : : | +-------------+-------------+-------------+-------------+ - which is "automatically" backward-compatible. We also increase the max number of Gap ACK blocks to 128, allowing up to 64 blocks per type (total buffer size = 516 bytes). Besides, the 'tipc_link_advance_transmq()' function is refactored so that it is now applicable to both the unicast and broadcast cases; some old functions can thus be removed and the code is optimized. With the patch, TIPC broadcast is more robust regardless of packet loss or disorder, latency, ... in the underlying network. Its performance is boosted significantly. For example, an experiment with a 5% packet loss rate gives these results: $ time tipc-pipe --mc --rdm --data_size 123 --data_num 1500000 real 0m 42.46s user 0m 1.16s sys 0m 17.67s Without the patch: $ time tipc-pipe --mc --rdm --data_size 123 --data_num 1500000 real 8m 27.94s user 0m 0.55s sys 0m 2.38s Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S.
Miller --- net/tipc/bcast.c | 9 +- net/tipc/link.c | 425 ++++++++++++++++++++++++++++++++----------------------- net/tipc/link.h | 7 +- net/tipc/msg.h | 27 +++- net/tipc/node.c | 10 +- 5 files changed, 293 insertions(+), 185 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 4c20be08b9c4..3ce690a96ee9 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -474,7 +474,7 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); - tipc_link_bc_ack_rcv(l, acked, &xmitq); + tipc_link_bc_ack_rcv(l, acked, 0, NULL, &xmitq); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); @@ -492,6 +492,7 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct tipc_gap_ack_blks *ga; struct sk_buff_head xmitq; int rc = 0; @@ -501,8 +502,10 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, if (msg_type(hdr) != STATE_MSG) { tipc_link_bc_init_rcv(l, hdr); } else if (!msg_bc_ack_invalid(hdr)) { - tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); - rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq); + tipc_get_gap_ack_blks(&ga, l, hdr, false); + rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), + msg_bc_gap(hdr), ga, &xmitq); + rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq); } tipc_bcast_unlock(net); diff --git a/net/tipc/link.c b/net/tipc/link.c index d4675e922a8f..d29b9c531171 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -188,6 +188,8 @@ struct tipc_link { /* Broadcast */ u16 ackers; u16 acked; + u16 last_gap; + struct tipc_gap_ack_blks *last_ga; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; u8 nack_state; @@ -249,11 +251,14 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); -static int tipc_link_release_pkts(struct tipc_link *l, u16 to); -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap); -static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, +static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, + struct tipc_link *l, u8 start_index); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr); +static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, + u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + bool *retransmitted, int *rc); static void tipc_link_update_cwin(struct tipc_link *l, int released, bool retransmitted); /* @@ -370,7 +375,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, snd_l->ackers--; rcv_l->bc_peer_is_up = true; rcv_l->state = LINK_ESTABLISHED; - tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); + tipc_link_bc_ack_rcv(rcv_l, ack, 0, NULL, xmitq); trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; @@ -784,8 +789,6 @@ bool tipc_link_too_silent(struct tipc_link *l) return (l->silent_intv_cnt + 2 > l->abort_limit); } -static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq); /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -948,6 +951,9 @@ void tipc_link_reset(struct tipc_link *l) l->snd_nxt_state = 1; l->rcv_nxt_state = 1; l->acked = 0; + l->last_gap 
= 0; + kfree(l->last_ga); + l->last_ga = NULL; l->silent_intv_cnt = 0; l->rst_cnt = 0; l->bc_peer_is_up = false; @@ -1183,68 +1189,14 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, if (link_is_bc_sndlink(l)) { r->state = LINK_RESET; - *rc = TIPC_LINK_DOWN_EVT; + *rc |= TIPC_LINK_DOWN_EVT; } else { - *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + *rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } return true; } -/* tipc_link_bc_retrans() - retransmit zero or more packets - * @l: the link to transmit on - * @r: the receiving link ordering the retransmit. Same as l if unicast - * @from: retransmit from (inclusive) this sequence number - * @to: retransmit to (inclusive) this sequence number - * xmitq: queue for accumulating the retransmitted packets - */ -static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq) -{ - struct sk_buff *_skb, *skb = skb_peek(&l->transmq); - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - u16 ack = l->rcv_nxt - 1; - int retransmitted = 0; - struct tipc_msg *hdr; - int rc = 0; - - if (!skb) - return 0; - if (less(to, from)) - return 0; - - trace_tipc_link_retrans(r, from, to, &l->transmq); - - if (link_retransmit_failure(l, r, &rc)) - return rc; - - skb_queue_walk(&l->transmq, skb) { - hdr = buf_msg(skb); - if (less(msg_seqno(hdr), from)) - continue; - if (more(msg_seqno(hdr), to)) - break; - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) - continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = pskb_copy(skb, GFP_ATOMIC); - if (!_skb) - return 0; - hdr = buf_msg(_skb); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - _skb->priority = TC_PRIO_CONTROL; - __skb_queue_tail(xmitq, _skb); - l->stats.retransmitted++; - retransmitted++; - /* Increase actual retrans counter & mark first time */ - if (!TIPC_SKB_CB(skb)->retr_cnt++) - TIPC_SKB_CB(skb)->retr_stamp = jiffies; - } - tipc_link_update_cwin(l, 0, retransmitted); - return 0; -} - /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type @@ -1402,46 +1354,71 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } -static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) -{ - int released = 0; - struct sk_buff *skb, *tmp; - - skb_queue_walk_safe(&l->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - __skb_unlink(skb, &l->transmq); - kfree_skb(skb); - released++; +/** + * tipc_get_gap_ack_blks - get Gap ACK blocks from PROTOCOL/STATE_MSG + * @ga: returned pointer to the Gap ACK blocks if any + * @l: the tipc link + * @hdr: the PROTOCOL/STATE_MSG header + * @uc: desired Gap ACK blocks type, i.e. unicast (= 1) or broadcast (= 0) + * + * Return: the total Gap ACK blocks size + */ +u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, + struct tipc_msg *hdr, bool uc) +{ + struct tipc_gap_ack_blks *p; + u16 sz = 0; + + /* Does peer support the Gap ACK blocks feature? */ + if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { + p = (struct tipc_gap_ack_blks *)msg_data(hdr); + sz = ntohs(p->len); + /* Sanity check */ + if (sz == tipc_gap_ack_blks_sz(p->ugack_cnt + p->bgack_cnt)) { + /* Good, check if the desired type exists */ + if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt)) + goto ok; + /* Backward compatible: peer might not support bc, but uc? 
*/ + } else if (uc && sz == tipc_gap_ack_blks_sz(p->ugack_cnt)) { + if (p->ugack_cnt) { + p->bgack_cnt = 0; + goto ok; + } + } } - return released; + /* Other cases: ignore! */ + p = NULL; + +ok: + *ga = p; + return sz; } -/* tipc_build_gap_ack_blks - build Gap ACK blocks - * @l: tipc link that data have come with gaps in sequence if any - * @data: data buffer to store the Gap ACK blocks after built - * - * returns the actual allocated memory size - */ -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) +static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, + struct tipc_link *l, u8 start_index) { + struct tipc_gap_ack *gacks = &ga->gacks[start_index]; struct sk_buff *skb = skb_peek(&l->deferdq); - struct tipc_gap_ack_blks *ga = data; - u16 len, expect, seqno = 0; + u16 expect, seqno = 0; u8 n = 0; - if (!skb || !gap) - goto exit; + if (!skb) + return 0; expect = buf_seqno(skb); skb_queue_walk(&l->deferdq, skb) { seqno = buf_seqno(skb); if (unlikely(more(seqno, expect))) { - ga->gacks[n].ack = htons(expect - 1); - ga->gacks[n].gap = htons(seqno - expect); - if (++n >= MAX_GAP_ACK_BLKS) { - pr_info_ratelimited("Too few Gap ACK blocks!\n"); - goto exit; + gacks[n].ack = htons(expect - 1); + gacks[n].gap = htons(seqno - expect); + if (++n >= MAX_GAP_ACK_BLKS / 2) { + char buf[TIPC_MAX_LINK_NAME]; + + pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n", + tipc_link_name_ext(l, buf), + n, + skb_queue_len(&l->deferdq)); + return n; } } else if (unlikely(less(seqno, expect))) { pr_warn("Unexpected skb in deferdq!\n"); @@ -1451,14 +1428,44 @@ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) } /* last block */ - ga->gacks[n].ack = htons(seqno); - ga->gacks[n].gap = 0; + gacks[n].ack = htons(seqno); + gacks[n].gap = 0; n++; + return n; +} -exit: - len = tipc_gap_ack_blks_sz(n); +/* tipc_build_gap_ack_blks - build Gap ACK blocks + * @l: tipc unicast link + * @hdr: the tipc message buffer to store the Gap ACK blocks after built + * + * The function builds Gap ACK blocks for both the unicast & broadcast receiver + * links of a certain peer, the buffer after built has the network data format + * as found at the struct tipc_gap_ack_blks definition. + * + * returns the actual allocated memory size + */ +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr) +{ + struct tipc_link *bcl = l->bc_rcvlink; + struct tipc_gap_ack_blks *ga; + u16 len; + + ga = (struct tipc_gap_ack_blks *)msg_data(hdr); + + /* Start with broadcast link first */ + tipc_bcast_lock(bcl->net); + msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1); + msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); + ga->bgack_cnt = __tipc_build_gap_ack_blks(ga, bcl, 0); + tipc_bcast_unlock(bcl->net); + + /* Now for unicast link, but an explicit NACK only (???) */ + ga->ugack_cnt = (msg_seq_gap(hdr)) ? + __tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0; + + /* Total len */ + len = tipc_gap_ack_blks_sz(ga->bgack_cnt + ga->ugack_cnt); ga->len = htons(len); - ga->gack_cnt = n; return len; } @@ -1466,47 +1473,109 @@ exit: * acked packets, also doing retransmissions if * gaps found * @l: tipc link with transmq queue to be advanced + * @r: tipc link "receiver" i.e. 
in case of broadcast (= "l" if unicast) * @acked: seqno of last packet acked by peer without any gaps before * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any + * @retransmitted: returned boolean value if a retransmission is really issued + * @rc: returned code e.g. TIPC_LINK_DOWN_EVT if a repeated retransmit failures + * happens (- unlikely case) * - * In case of a repeated retransmit failures, the call will return shortly - * with a returned code (e.g. TIPC_LINK_DOWN_EVT) + * Return: the number of packets released from the link transmq */ -static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, +static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, + u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + bool *retransmitted, int *rc) { + struct tipc_gap_ack_blks *last_ga = r->last_ga, *this_ga = NULL; + struct tipc_gap_ack *gacks = NULL; struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; + u32 qlen = skb_queue_len(&l->transmq); + u16 nacked = acked, ngap = gap, gack_cnt = 0; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - bool retransmitted = false; u16 ack = l->rcv_nxt - 1; - bool passed = false; - u16 released = 0; u16 seqno, n = 0; - int rc = 0; + u16 end = r->acked, start = end, offset = r->last_gap; + u16 si = (last_ga) ? last_ga->start_index : 0; + bool is_uc = !link_is_bc_sndlink(l); + bool bc_has_acked = false; + + /* Determine Gap ACK blocks if any for the particular link */ + if (ga && is_uc) { + /* Get the Gap ACKs, uc part */ + gack_cnt = ga->ugack_cnt; + gacks = &ga->gacks[ga->bgack_cnt]; + } else if (ga) { + /* Copy the Gap ACKs, bc part, for later renewal if needed */ + this_ga = kmemdup(ga, tipc_gap_ack_blks_sz(ga->bgack_cnt), + GFP_ATOMIC); + if (likely(this_ga)) { + this_ga->start_index = 0; + /* Start with the bc Gap ACKs */ + gack_cnt = this_ga->bgack_cnt; + gacks = &this_ga->gacks[0]; + } else { + /* Hmm, we can get in trouble..., simply ignore it */ + pr_warn_ratelimited("Ignoring bc Gap ACKs, no memory\n"); + } + } + /* Advance the link transmq */ skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); next_gap_ack: - if (less_eq(seqno, acked)) { + if (less_eq(seqno, nacked)) { + if (is_uc) + goto release; + /* Skip packets peer has already acked */ + if (!more(seqno, r->acked)) + continue; + /* Get the next of last Gap ACK blocks */ + while (more(seqno, end)) { + if (!last_ga || si >= last_ga->bgack_cnt) + break; + start = end + offset + 1; + end = ntohs(last_ga->gacks[si].ack); + offset = ntohs(last_ga->gacks[si].gap); + si++; + WARN_ONCE(more(start, end) || + (!offset && + si < last_ga->bgack_cnt) || + si > MAX_GAP_ACK_BLKS, + "Corrupted Gap ACK: %d %d %d %d %d\n", + start, end, offset, si, + last_ga->bgack_cnt); + } + /* Check against the last Gap ACK block */ + if (in_range(seqno, start, end)) + continue; + /* Update/release the packet peer is acking */ + bc_has_acked = true; + if (--TIPC_SKB_CB(skb)->ackers) + continue; +release: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); - released++; - } else if (less_eq(seqno, acked + gap)) { - /* First, check if repeated retrans failures occurs? */ - if (!passed && link_retransmit_failure(l, l, &rc)) - return rc; - passed = true; - + } else if (less_eq(seqno, nacked + ngap)) { + /* First gap: check if repeated retrans failures? 
*/ + if (unlikely(seqno == acked + 1 && + link_retransmit_failure(l, r, rc))) { + /* Ignore this bc Gap ACKs if any */ + kfree(this_ga); + this_ga = NULL; + break; + } /* retransmit skb if unrestricted*/ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; + TIPC_SKB_CB(skb)->nxt_retr = (is_uc) ? + TIPC_UC_RETR_TIME : TIPC_BC_RETR_LIM; _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; @@ -1516,25 +1585,51 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - retransmitted = true; + *retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } else { /* retry with Gap ACK blocks if any */ - if (!ga || n >= ga->gack_cnt) + if (n >= gack_cnt) break; - acked = ntohs(ga->gacks[n].ack); - gap = ntohs(ga->gacks[n].gap); + nacked = ntohs(gacks[n].ack); + ngap = ntohs(gacks[n].gap); n++; goto next_gap_ack; } } - if (released || retransmitted) - tipc_link_update_cwin(l, released, retransmitted); - if (released) - tipc_link_advance_backlog(l, xmitq); - return 0; + + /* Renew last Gap ACK blocks for bc if needed */ + if (bc_has_acked) { + if (this_ga) { + kfree(last_ga); + r->last_ga = this_ga; + r->last_gap = gap; + } else if (last_ga) { + if (less(acked, start)) { + si--; + offset = start - acked - 1; + } else if (less(acked, end)) { + acked = end; + } + if (si < last_ga->bgack_cnt) { + last_ga->start_index = si; + r->last_gap = offset; + } else { + kfree(last_ga); + r->last_ga = NULL; + r->last_gap = 0; + } + } else { + r->last_gap = 0; + } + r->acked = acked; + } else { + kfree(this_ga); + } + + return qlen - skb_queue_len(&l->transmq); } /* tipc_link_build_state_msg: prepare link state message for transmission @@ -1651,7 +1746,8 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, kfree_skb(skb); break; } - released += tipc_link_release_pkts(l, msg_ack(hdr)); + released += tipc_link_advance_transmq(l, l, msg_ack(hdr), 0, + NULL, NULL, NULL, NULL); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { @@ -1739,7 +1835,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) - glen = tipc_build_gap_ack_blks(l, data, rcvgap); + glen = tipc_build_gap_ack_blks(l, hdr); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); @@ -2027,20 +2123,19 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, { struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; - u16 rcvgap = 0; - u16 ack = msg_ack(hdr); - u16 gap = msg_seq_gap(hdr); + bool reply = msg_probe(hdr), retransmitted = false; + u16 dlen = msg_data_sz(hdr), glen = 0; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); + u16 gap = msg_seq_gap(hdr); + u16 ack = msg_ack(hdr); u16 rcv_nxt = l->rcv_nxt; - u16 dlen = msg_data_sz(hdr); + u16 rcvgap = 0; int mtyp = msg_type(hdr); - bool reply = msg_probe(hdr); - u16 glen = 0; - void *data; + int rc = 0, released; char *if_name; - int rc = 0; + void *data; trace_tipc_proto_rcv(skb, false, l->name); if (tipc_link_is_blocked(l) || !xmitq) @@ -2137,13 +2232,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, 
} /* Receive Gap ACK blocks from peer if any */ - if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { - ga = (struct tipc_gap_ack_blks *)data; - glen = ntohs(ga->len); - /* sanity check: if failed, ignore Gap ACK blocks */ - if (glen != tipc_gap_ack_blks_sz(ga->gack_cnt)) - ga = NULL; - } + glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); @@ -2158,9 +2247,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); + released = tipc_link_advance_transmq(l, l, ack, gap, ga, xmitq, + &retransmitted, &rc); if (gap) l->stats.recv_nacks++; + if (released || retransmitted) + tipc_link_update_cwin(l, released, retransmitted); + if (released) + tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } @@ -2246,10 +2340,7 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq) { - struct tipc_link *snd_l = l->bc_sndlink; u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); - u16 from = msg_bcast_ack(hdr) + 1; - u16 to = from + msg_bc_gap(hdr) - 1; int rc = 0; if (!link_is_up(l)) @@ -2271,8 +2362,6 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq); - l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) rc |= TIPC_LINK_SND_STATE; @@ -2307,38 +2396,27 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, return 0; } -void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq) +int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq) { - struct sk_buff *skb, *tmp; - struct tipc_link *snd_l = l->bc_sndlink; + struct tipc_link *l = r->bc_sndlink; + bool unused = false; + int rc = 0; - if (!link_is_up(l) || !l->bc_peer_is_up) - return; + if (!link_is_up(r) || !r->bc_peer_is_up) + return 0; - if (!more(acked, l->acked)) - return; + if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) + return 0; - trace_tipc_link_bc_ack(l, l->acked, acked, &snd_l->transmq); - /* Skip over packets peer has already acked */ - skb_queue_walk(&snd_l->transmq, skb) { - if (more(buf_seqno(skb), l->acked)) - break; - } + tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); - /* Update/release the packets peer is acking now */ - skb_queue_walk_from_safe(&snd_l->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - if (!--TIPC_SKB_CB(skb)->ackers) { - __skb_unlink(skb, &snd_l->transmq); - kfree_skb(skb); - } - } - l->acked = acked; - tipc_link_advance_backlog(snd_l, xmitq); - if (unlikely(!skb_queue_empty(&snd_l->wakeupq))) - link_prepare_wakeup(snd_l); + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + + return rc; } /* tipc_link_bc_nack_rcv(): receive broadcast nack message @@ -2366,8 +2444,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; if (dnode == tipc_own_addr(l->net)) { - tipc_link_bc_ack_rcv(l, acked, xmitq); - rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq); + rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq); l->stats.recv_nacks++; return rc; } 
diff --git a/net/tipc/link.h b/net/tipc/link.h index d3c1c3fc1659..0a0fa7350722 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -143,8 +143,11 @@ int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); int tipc_link_mss(struct tipc_link *l); -void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq); +u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, + struct tipc_msg *hdr, bool uc); +int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 871feadbbc19..ca5f8689a33b 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -160,20 +160,39 @@ struct tipc_gap_ack { /* struct tipc_gap_ack_blks * @len: actual length of the record - * @gack_cnt: number of Gap ACK blocks in the record + * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast + * ones) + * @start_index: starting index for "valid" broadcast Gap ACK blocks + * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks + * + * 31 16 15 0 + * +-------------+-------------+-------------+-------------+ + * | bgack_cnt | ugack_cnt | len | + * +-------------+-------------+-------------+-------------+ - + * | gap | ack | | + * +-------------+-------------+-------------+-------------+ > bc gacks + * : : : | + * +-------------+-------------+-------------+-------------+ - + * | gap | ack | | + * +-------------+-------------+-------------+-------------+ > uc gacks + * : : : | + * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; - u8 gack_cnt; - u8 reserved; + union { + u8 ugack_cnt; + u8 start_index; + }; + u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define tipc_gap_ack_blks_sz(n) (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * (n)) -#define MAX_GAP_ACK_BLKS 32 +#define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ tipc_gap_ack_blks_sz(MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) diff --git a/net/tipc/node.c b/net/tipc/node.c index 803a3a6d0f50..6a49b3eeaae9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2071,10 +2071,16 @@ rcv: le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ - if (unlikely(usr == LINK_PROTOCOL)) + if (unlikely(usr == LINK_PROTOCOL)) { + if (unlikely(skb_linearize(skb))) { + tipc_node_put(n); + goto discard; + } + hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); - else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) + } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); + } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); -- cgit v1.2.3 From c6ed7a5cc2d68c36287c09260dc211173e0447d7 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:35 +0700 Subject: tipc: add back link trace events In the previous commit ("tipc: add Gap ACK blocks support for broadcast link"), we have removed the following link trace events due to the code changes: - tipc_link_bc_ack - tipc_link_retrans This commit adds them back along with some minor changes to adapt to the new code. 
Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/trace.h | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index d29b9c531171..288c5670cfa5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1504,6 +1504,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, bool is_uc = !link_is_bc_sndlink(l); bool bc_has_acked = false; + trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq); + /* Determine Gap ACK blocks if any for the particular link */ if (ga && is_uc) { /* Get the Gap ACKs, uc part */ @@ -2410,6 +2412,7 @@ int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; + trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 4d8e00483afc..e7535ab75255 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -299,8 +299,10 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); - __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq))); - __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq))); + __entry->fseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek(tq))) : 0; + __entry->lseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", @@ -308,15 +310,16 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->len, __entry->fseqno, __entry->lseqno) ); -DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans, +DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), - TP_ARGS(r, f, t, tq) + TP_ARGS(r, f, t, tq), + TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), - TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n", + TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); -- cgit v1.2.3 From a91d55d162b86fb983b88f44296149752db7efbd Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:36 +0700 Subject: tipc: enable broadcast retrans via unicast In some environment, broadcast traffic is suppressed at high rate (i.e. a kind of bandwidth limit setting). When it is applied, TIPC broadcast can still run successfully. However, when it comes to a high load, some packets will be dropped first and TIPC tries to retransmit them but the packet retransmission is intentionally broadcast too, so making things worse and not helpful at all. This commit enables the broadcast retransmission via unicast which only retransmits packets to the specific peer that has really reported a gap i.e. not broadcasting to all nodes in the cluster, so will prevent from being suppressed, and also reduce some overheads on the other peers due to duplicates, finally improve the overall TIPC broadcast performance. Note: the functionality can be turned on/off via the sysctl file: echo 1 > /proc/sys/net/tipc/bc_retruni echo 0 > /proc/sys/net/tipc/bc_retruni Default is '0', i.e. the broadcast retransmission still works as usual. 
Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/trace.h | 13 ++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index d29b9c531171..288c5670cfa5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1504,6 +1504,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, bool is_uc = !link_is_bc_sndlink(l); bool bc_has_acked = false; + trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq); + /* Determine Gap ACK blocks if any for the particular link */ if (ga && is_uc) { /* Get the Gap ACKs, uc part */ @@ -2410,6 +2412,7 @@ int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; + trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 4d8e00483afc..e7535ab75255 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -299,8 +299,10 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); - __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq))); - __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq))); + __entry->fseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek(tq))) : 0; + __entry->lseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", @@ -308,15 +310,16 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->len, __entry->fseqno, __entry->lseqno) ); -DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans, +DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), - TP_ARGS(r, f, t, tq) + TP_ARGS(r, f, t, tq), + TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), - TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n", + TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); -- cgit v1.2.3 From a91d55d162b86fb983b88f44296149752db7efbd Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:36 +0700 Subject: tipc: enable broadcast retrans via unicast In some environments, broadcast traffic is suppressed at high rates (i.e. a kind of bandwidth limit setting). When this is applied, TIPC broadcast can still run successfully. However, when it comes to a high load, some packets will be dropped first and TIPC tries to retransmit them, but the retransmission is intentionally broadcast too, making things worse and not helpful at all. This commit enables broadcast retransmission via unicast, which only retransmits packets to the specific peer that has actually reported a gap, i.e. without broadcasting to all nodes in the cluster. This prevents the retransmissions from being suppressed, reduces overhead on the other peers due to duplicates, and ultimately improves the overall TIPC broadcast performance. Note: the functionality can be turned on/off via the sysctl file: echo 1 > /proc/sys/net/tipc/bc_retruni echo 0 > /proc/sys/net/tipc/bc_retruni Default is '0', i.e. the broadcast retransmission still works as usual.
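Stripped of the TIPC plumbing, the knob boils down to one queue-selection decision; a sketch under simplified names (in the patch the chosen queue is passed down as the retrq argument):

/* "struct queue" is a placeholder type for this sketch. */
struct queue;

static struct queue *pick_retr_queue(unsigned long bc_retruni,
                                     struct queue *bcast_xmitq,
                                     struct queue *peer_retrq)
{
        if (!bc_retruni)
                return bcast_xmitq; /* legacy: retransmit via broadcast */

        return peer_retrq; /* retransmit only to the gap reporter */
}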
tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, return 0; trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); - tipc_link_advance_transmq(l, r, acked, gap, ga, xmitq, &unused, &rc); + tipc_link_advance_transmq(l, r, acked, gap, ga, retrq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) @@ -2447,7 +2448,8 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; if (dnode == tipc_own_addr(l->net)) { - rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq); + rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq, + xmitq); l->stats.recv_nacks++; return rc; } diff --git a/net/tipc/link.h b/net/tipc/link.h index 0a0fa7350722..4d0768cf91d5 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -147,7 +147,8 @@ u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, struct tipc_msg *hdr, bool uc); int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + struct sk_buff_head *retrq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/node.c b/net/tipc/node.c index 6a49b3eeaae9..548207fdec15 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1772,7 +1772,7 @@ static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, struct tipc_link *ucl; int rc; - rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr); + rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 58ab3d6dcdce..97a6264a2993 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -36,7 +36,7 @@ #include "core.h" #include "trace.h" #include "crypto.h" - +#include "bcast.h" #include <linux/module.h> static struct ctl_table_header *tipc_ctl_hdr; @@ -75,6 +75,13 @@ static struct ctl_table tipc_table[] = { .extra1 = SYSCTL_ONE, }, #endif + { + .procname = "bc_retruni", + .data = &sysctl_tipc_bc_retruni, + .maxlen = sizeof(sysctl_tipc_bc_retruni), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, {} }; -- cgit v1.2.3 From 03b6fefd9bb4844c75faeb10df8496794e2fd5da Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:37 +0700 Subject: tipc: add support for broadcast rcv stats dumping This commit enables dumping the statistics of a broadcast-receiver link like the traditional 'broadcast-link' one (which is for the broadcast-sender). The link dumping can be triggered via netlink (e.g. the iproute2/tipc tool) by the link flag 'TIPC_NLA_LINK_BROADCAST' as the indicator. The name of a broadcast-receiver link of a specific peer will be in the format: 'broadcast-link:<peer-id>'. For example:
Link Window:50 packets
RX packets:7841 fragments:2408/440 bundles:0/0
TX packets:0 fragments:0/0 bundles:0/0
RX naks:0 defs:124 dups:0
TX naks:21 acks:0 retrans:0
Congestion link:0 Send queue max:0 avg:0
In addition, the broadcast-receiver link statistics can be reset in the usual way via netlink by specifying that link name in the command. Note: 'tipc_link_name_ext()' is removed because the link name can now be retrieved simply via 'l->name'. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller
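The per-peer naming scheme introduced by this patch is plain string assembly, as the link.c hunk below shows: the receiver link gets 'broadcast-link:' plus the stringified peer id (with a hex-address fallback when the id string is longer than 16 characters). A stand-alone sketch of just the assembly step, assuming the peer id has already been rendered to a string; the buffer size and the example peer value are illustrative only:

#include <stdio.h>

#define TIPC_MAX_LINK_NAME 68	/* illustrative buffer size */

/* Models the naming scheme from tipc_link_bc_create(): a
 * broadcast-receiver link is named "broadcast-link:<peer>", while
 * the broadcast send link keeps the plain "broadcast-link" name.
 */
static void bc_link_name(char *buf, size_t len, const char *peer_str)
{
	if (peer_str)
		snprintf(buf, len, "%s:%s", "broadcast-link", peer_str);
	else
		snprintf(buf, len, "%s", "broadcast-link");
}

int main(void)
{
	char name[TIPC_MAX_LINK_NAME];

	bc_link_name(name, sizeof(name), "1001002");	/* made-up peer id */
	printf("receiver link: %s\n", name);	/* broadcast-link:1001002 */
	bc_link_name(name, sizeof(name), NULL);
	printf("send link:     %s\n", name);	/* broadcast-link */
	return 0;
}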
--- net/tipc/bcast.c | 6 ++--- net/tipc/bcast.h | 5 +++-- net/tipc/link.c | 65 +++++++++++++++++++++++++++--------------------------- net/tipc/link.h | 3 +-- net/tipc/msg.c | 9 ++++---- net/tipc/msg.h | 2 +- net/tipc/netlink.c | 2 +- net/tipc/node.c | 61 +++++++++++++++++++++++++++++++++++++++++++------- net/tipc/trace.h | 4 ++-- 9 files changed, 101 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 50a16f8bebd9..383f87bc1061 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -563,10 +563,8 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_sk_rcv(net, inputq); } -int tipc_bclink_reset_stats(struct net *net) +int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l) { - struct tipc_link *l = tipc_bc_sndlink(net); - if (!l) return -ENOPROTOOPT; @@ -694,7 +692,7 @@ int tipc_bcast_init(struct net *net) tn->bcbase = bb; spin_lock_init(&tipc_net(net)->bclock); - if (!tipc_link_bc_create(net, 0, 0, + if (!tipc_link_bc_create(net, 0, 0, NULL, FB_MTU, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 97d3cf9d3e4d..4240c95188b1 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -96,9 +96,10 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *retrq); -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *bcl); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); -int tipc_bclink_reset_stats(struct net *net); +int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); u32 tipc_bcast_get_broadcast_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); diff --git a/net/tipc/link.c b/net/tipc/link.c index af352391e2ab..ee3b8d0576b8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -539,7 +539,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * * Returns true if link was created, otherwise false */ -bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, @@ -554,7 +554,18 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, return false; l = *link; - strcpy(l->name, tipc_bclink_name); + if (peer_id) { + char peer_str[NODE_ID_STR_LEN] = {0,}; + + tipc_nodeid2string(peer_str, peer_id); + if (strlen(peer_str) > 16) + sprintf(peer_str, "%x", peer); + /* Broadcast receiver link name: "broadcast-link:<peer-id>" */ + snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, + peer_str); + } else { + strcpy(l->name, tipc_bclink_name); + } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!"); tipc_link_reset(l); l->state = LINK_RESET; @@ -1412,11 +1423,8 @@ static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, gacks[n].ack = htons(expect - 1); gacks[n].gap = htons(seqno - expect); if (++n >= MAX_GAP_ACK_BLKS / 2) { - char buf[TIPC_MAX_LINK_NAME]; - pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n", - tipc_link_name_ext(l, buf), - n, + l->name, n, skb_queue_len(&l->deferdq)); return n; } @@ -1587,6 +1595,8 @@ release: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; + if (!is_uc) + r->stats.retransmitted++; *retransmitted
= true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) @@ -1753,7 +1763,8 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { - __tipc_skb_queue_sorted(defq, seqno, skb); + if (!__tipc_skb_queue_sorted(defq, seqno, skb)) + l->stats.duplicates++; rc |= tipc_link_build_nack_msg(l, xmitq); break; } @@ -1787,15 +1798,15 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, int tolerance, int priority, struct sk_buff_head *xmitq) { + struct tipc_mon_state *mstate = &l->mon_state; + struct sk_buff_head *dfq = &l->deferdq; struct tipc_link *bcl = l->bc_rcvlink; - struct sk_buff *skb; struct tipc_msg *hdr; - struct sk_buff_head *dfq = &l->deferdq; + struct sk_buff *skb; bool node_up = link_is_up(bcl); - struct tipc_mon_state *mstate = &l->mon_state; + u16 glen = 0, bc_rcvgap = 0; int dlen = 0; void *data; - u16 glen = 0; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1833,7 +1844,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); - msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); + bc_rcvgap = link_bc_rcv_gap(bcl); + msg_set_bc_gap(hdr, bc_rcvgap); msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) @@ -1858,6 +1870,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, l->stats.sent_probes++; if (rcvgap) l->stats.sent_nacks++; + if (bc_rcvgap) + bcl->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); trace_tipc_proto_build(skb, false, l->name); @@ -2358,8 +2372,6 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!l->bc_peer_is_up) return rc; - l->stats.recv_nacks++; - /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; @@ -2410,6 +2422,11 @@ int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, if (!link_is_up(r) || !r->bc_peer_is_up) return 0; + if (gap) { + l->stats.recv_nacks++; + r->stats.recv_nacks++; + } + if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; @@ -2721,16 +2738,15 @@ msg_full: return -EMSGSIZE; } -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *bcl) { int err; void *hdr; struct nlattr *attrs; struct nlattr *prop; - struct tipc_net *tn = net_generic(net, tipc_net_id); u32 bc_mode = tipc_bcast_get_broadcast_mode(net); u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); - struct tipc_link *bcl = tn->bcl; if (!bcl) return 0; @@ -2817,21 +2833,6 @@ void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) l->abort_limit = limit; } -char *tipc_link_name_ext(struct tipc_link *l, char *buf) -{ - if (!l) - scnprintf(buf, TIPC_MAX_LINK_NAME, "null"); - else if (link_is_bc_sndlink(l)) - scnprintf(buf, TIPC_MAX_LINK_NAME, "broadcast-sender"); - else if (link_is_bc_rcvlink(l)) - scnprintf(buf, TIPC_MAX_LINK_NAME, - "broadcast-receiver, peer %x", l->addr); - else - memcpy(buf, l->name, TIPC_MAX_LINK_NAME); - - return buf; -} - /** * tipc_link_dump - dump TIPC link data * @l: tipc link to be dumped diff --git a/net/tipc/link.h b/net/tipc/link.h index 4d0768cf91d5..fc07232c9a12 100644 --- 
a/net/tipc/link.h +++ b/net/tipc/link.h @@ -80,7 +80,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link **link); -bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, @@ -111,7 +111,6 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); -char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4d0e0bdd997b..c69fb99163fc 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -825,19 +825,19 @@ bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, * @seqno: sequence number of buffer to add * @skb: buffer to add */ -void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, +bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb) { struct sk_buff *_skb, *tmp; if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { __skb_queue_head(list, skb); - return; + return true; } if (more(seqno, buf_seqno(skb_peek_tail(list)))) { __skb_queue_tail(list, skb); - return; + return true; } skb_queue_walk_safe(list, _skb, tmp) { @@ -846,9 +846,10 @@ void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, if (seqno == buf_seqno(_skb)) break; __skb_queue_before(list, _skb, skb); - return; + return true; } kfree_skb(skb); + return false; } void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, diff --git a/net/tipc/msg.h b/net/tipc/msg.h index ca5f8689a33b..cd4281779468 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1145,7 +1145,7 @@ bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); -void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, +bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index bb9862410e68..c4aee6247d55 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -188,7 +188,7 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_LINK_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT, .doit = tipc_nl_node_get_link, .dumpit = tipc_nl_node_dump_link, }, diff --git a/net/tipc/node.c b/net/tipc/node.c index 548207fdec15..0312fb181d94 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1138,7 +1138,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (unlikely(!n->bc_entry.link)) { snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, + addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, @@ -2435,7 +2435,7 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { - err = tipc_nl_add_bc_link(net, &msg); + err = tipc_nl_add_bc_link(net, &msg, 
tipc_net(net)->bcl); if (err) goto err_free; } else { @@ -2479,6 +2479,7 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) @@ -2495,11 +2496,26 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); - if (strcmp(link_name, tipc_bclink_name) == 0) { - err = tipc_bclink_reset_stats(net); + err = -EINVAL; + if (!strcmp(link_name, tipc_bclink_name)) { + err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; + } else if (strstr(link_name, tipc_bclink_name)) { + rcu_read_lock(); + list_for_each_entry_rcu(node, &tn->node_list, list) { + tipc_node_read_lock(node); + link = node->bc_entry.link; + if (link && !strcmp(link_name, tipc_link_name(link))) { + err = tipc_bclink_reset_stats(net, link); + tipc_node_read_unlock(node); + break; + } + tipc_node_read_unlock(node); + } + rcu_read_unlock(); + return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); @@ -2523,7 +2539,8 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, - struct tipc_node *node, u32 *prev_link) + struct tipc_node *node, u32 *prev_link, + bool bc_link) { u32 i; int err; @@ -2539,6 +2556,14 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, if (err) return err; } + + if (bc_link) { + *prev_link = i; + err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); + if (err) + return err; + } + *prev_link = 0; return 0; @@ -2547,17 +2572,36 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; + bool bc_link = cb->args[3]; int err; if (done) return 0; + if (!prev_node) { + /* Check if broadcast-receiver links dumping is needed */ + if (attrs && attrs[TIPC_NLA_LINK]) { + err = nla_parse_nested_deprecated(link, + TIPC_NLA_LINK_MAX, + attrs[TIPC_NLA_LINK], + tipc_nl_link_policy, + NULL); + if (unlikely(err)) + return err; + if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) + return -EINVAL; + bc_link = true; + } + } + msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; @@ -2581,7 +2625,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); + &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; @@ -2589,14 +2633,14 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) prev_node = node->addr; } } else { - err = tipc_nl_add_bc_link(net, &msg); + err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); + &prev_link, bc_link); tipc_node_read_unlock(node); if 
(err) goto out; @@ -2611,6 +2655,7 @@ out: cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; + cb->args[3] = bc_link; return skb->len; } diff --git a/net/tipc/trace.h b/net/tipc/trace.h index e7535ab75255..04af83f0500c 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -255,7 +255,7 @@ DECLARE_EVENT_CLASS(tipc_link_class, TP_fast_assign( __assign_str(header, header); - tipc_link_name_ext(l, __entry->name); + memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), @@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, ), TP_fast_assign( - tipc_link_name_ext(r, __entry->name); + memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); -- cgit v1.2.3 From 0a3e060f340dbe232ffa290c40f879b7f7db595b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:38:38 +0700 Subject: tipc: add test for Nagle algorithm effectiveness When streaming in Nagle mode, we try to bundle as many small user messages as possible if there is an outstanding buffer, i.e. one not yet ACK-ed by the receiving side, which helps boost the overall throughput. So, the algorithm's effectiveness really depends on when the Nagle ACK comes and on the specific network latency (RTT), compared to the user's message sending rate. In a bad case, where the user's sending rate is low or the network latency is small, there will not be many bundles, so generating a Nagle ACK or waiting for it is not meaningful. For example: if a user sends its messages every 100ms and the RTT is 50ms, then for each message we require one Nagle ACK, yet only one user message is sent without any bundling. In a better case, even if we have a few bundles (e.g. with an RTT of 300ms), if the user sends medium-sized messages there will not be any difference at all: that is, 3 x 1000-byte data messages, if bundled, will still result in 3 bundles with MTU = 1500. When Nagle is ineffective, the delay added to user message sending is clearly wasted compared to sending directly. Besides, adding Nagle ACKs consumes some processor load on both the sending and receiving sides. This commit adds a test of the Nagle algorithm's effectiveness for an individual connection on the network where it actually runs. Particularly, upon receipt of a Nagle ACK we will compare the number of bundles in the backlog queue to the number of user messages which would have been sent directly without Nagle. If the ratio is good (e.g. >= 2), Nagle mode will be kept for further message sending. Otherwise, we will leave Nagle and put a 'penalty' on the connection, so it will have to spend more 'one-way' messages before being able to re-enter Nagle. In addition, the 'ack-required' bit is only set when really needed, so that the number of Nagle ACKs is reduced during Nagle mode. Testing with a benchmark showed that, with the patch, there was not much difference in throughput for small messages, since the tool continuously sends messages without a break, so Nagle would still take effect. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller
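The heart of the test is a small ratio check performed on each Nagle ACK, visible in the socket.c hunk below: msg_acc counts the user messages accepted under Nagle and pkt_cnt counts the packets actually sent for them, so msg_acc / pkt_cnt < 2 means bundling failed to at least halve the packet count. A stand-alone sketch of that bookkeeping, with field and constant names modeled on (but not identical to) the patch:

#include <stdbool.h>
#include <stdio.h>

#define NAGLE_START_INIT 4
#define NAGLE_START_MAX  1024

struct conn {
	unsigned int msg_acc;	/* user messages accumulated under Nagle */
	unsigned int pkt_cnt;	/* packets actually sent for them */
	unsigned int nagle_start;	/* one-way msgs needed to re-enter Nagle */
	bool nagle_on;
};

/* Models the check done on Nagle ACK receipt: keep Nagle only if
 * bundling at least halved the packet count (>= 2 msgs per packet);
 * otherwise leave Nagle and double the re-entry "penalty".
 */
static void nagle_ack_rcv(struct conn *c)
{
	if (!c->pkt_cnt || c->msg_acc / c->pkt_cnt < 2) {
		c->nagle_on = false;
		if (c->nagle_start < NAGLE_START_MAX)
			c->nagle_start *= 2;
	} else {
		c->nagle_start = NAGLE_START_INIT;
	}
	c->msg_acc = 0;
	c->pkt_cnt = 0;
}

int main(void)
{
	struct conn c = { .nagle_start = NAGLE_START_INIT, .nagle_on = true };

	c.msg_acc = 10; c.pkt_cnt = 3;	/* ~3.3 msgs/packet: effective */
	nagle_ack_rcv(&c);
	printf("effective:   nagle_on=%d start=%u\n", c.nagle_on, c.nagle_start);

	c.nagle_on = true;
	c.msg_acc = 4; c.pkt_cnt = 3;	/* ~1.3 msgs/packet: ineffective */
	nagle_ack_rcv(&c);
	printf("ineffective: nagle_on=%d start=%u\n", c.nagle_on, c.nagle_start);
	return 0;
}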
--- net/tipc/msg.c | 3 --- net/tipc/msg.h | 14 ++++++++++-- net/tipc/socket.c | 64 ++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 64 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index c69fb99163fc..23809039dda1 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -235,9 +235,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, msg_set_size(hdr, MIN_H_SIZE); __skb_queue_tail(txq, skb); total += 1; - if (prev) - msg_set_ack_required(buf_msg(prev), 0); - msg_set_ack_required(hdr, 1); } hdr = buf_msg(skb); curr = msg_blocks(hdr); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index cd4281779468..58660d56bc83 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -340,9 +340,19 @@ static inline int msg_ack_required(struct tipc_msg *m) return msg_bits(m, 0, 18, 1); } -static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +static inline void msg_set_ack_required(struct tipc_msg *m) { - msg_set_bits(m, 0, 18, 1, d); + msg_set_bits(m, 0, 18, 1, 1); +} + +static inline int msg_nagle_ack(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_nagle_ack(struct tipc_msg *m) +{ + msg_set_bits(m, 0, 18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e370ad0edd76..d6b67d07d22e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -48,6 +48,8 @@ #include "group.h" #include "trace.h" +#define NAGLE_START_INIT 4 +#define NAGLE_START_MAX 1024 #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 @@ -119,7 +121,10 @@ struct tipc_sock { struct rcu_head rcu; struct tipc_group *group; u32 oneway; + u32 nagle_start; u16 snd_backlog; + u16 msg_acc; + u16 pkt_cnt; bool expect_ack; bool nodelay; bool group_is_open; @@ -143,7 +148,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); -static void tipc_sk_push_backlog(struct tipc_sock *tsk); +static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -474,6 +479,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; tsk->maxnagle = 0; + tsk->nagle_start = NAGLE_START_INIT; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -541,7 +547,7 @@ static void __tipc_shutdown(struct socket *sock, int error) !tsk_conn_cong(tsk))); /* Push out delayed messages if in Nagle mode */ - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); /* Remove pending SYN */ __skb_queue_purge(&sk->sk_write_queue); @@ -1252,14 +1258,37 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, /* tipc_sk_push_backlog(): send accumulated buffers in socket write queue * when socket is in Nagle mode */ -static void tipc_sk_push_backlog(struct tipc_sock *tsk) +static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack) { struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct sk_buff *skb = skb_peek_tail(txq); struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); - struct sk_buff *skb = skb_peek(txq); int rc; + if
(nagle_ack) { + tsk->pkt_cnt += skb_queue_len(txq); + if (!tsk->pkt_cnt || tsk->msg_acc / tsk->pkt_cnt < 2) { + tsk->oneway = 0; + if (tsk->nagle_start < NAGLE_START_MAX) + tsk->nagle_start *= 2; + tsk->expect_ack = false; + pr_debug("tsk %10u: bad nagle %u -> %u, next start %u!\n", + tsk->portid, tsk->msg_acc, tsk->pkt_cnt, + tsk->nagle_start); + } else { + tsk->nagle_start = NAGLE_START_INIT; + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } + } + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; + } + if (!skb || tsk->cong_link_cnt) return; @@ -1267,9 +1296,10 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) if (msg_is_syn(buf_msg(skb))) return; + if (tsk->msg_acc) + tsk->pkt_cnt += skb_queue_len(txq); tsk->snt_unacked += tsk->snd_backlog; tsk->snd_backlog = 0; - tsk->expect_ack = true; rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (rc == -ELINKCONG) tsk->cong_link_cnt = 1; @@ -1322,8 +1352,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, return; } else if (mtyp == CONN_ACK) { was_cong = tsk_conn_cong(tsk); - tsk->expect_ack = false; - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, msg_nagle_ack(hdr)); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); @@ -1516,6 +1545,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); + struct sk_buff *skb; u32 dnode = tsk_peer_node(tsk); int maxnagle = tsk->maxnagle; int maxpkt = tsk->max_pkt; @@ -1544,17 +1574,25 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) break; send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); blocks = tsk->snd_backlog; - if (tsk->oneway++ >= 4 && send <= maxnagle) { + if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) { rc = tipc_msg_append(hdr, m, send, maxnagle, txq); if (unlikely(rc < 0)) break; blocks += rc; + tsk->msg_acc++; if (blocks <= 64 && tsk->expect_ack) { tsk->snd_backlog = blocks; sent += send; break; + } else if (blocks > 64) { + tsk->pkt_cnt += skb_queue_len(txq); + } else { + skb = skb_peek_tail(txq); + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; } - tsk->expect_ack = true; } else { rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); if (unlikely(rc != send)) @@ -2091,7 +2129,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2180,7 +2218,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, return false; case TIPC_ESTABLISHED: if (!skb_queue_empty(&sk->sk_write_queue)) - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) { @@ -2188,8 +2226,10 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff *skb; skb = tipc_sk_build_ack(tsk); - if (skb) + if (skb) { + msg_set_nagle_ack(buf_msg(skb)); __skb_queue_tail(xmitq, skb); + } } return true; } -- cgit v1.2.3 From 58cff782cc55eb755826c649976aea9f5f8b3086 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 May 2020 14:29:00 +0200 
Subject: flow_dissector: Parse multiple MPLS Label Stack Entries The current MPLS dissector only parses the first MPLS Label Stack Entry (the second LSE can be parsed too, but only to set a key_id). This patch adds the possibility to parse several LSEs by making __skb_flow_dissect_mpls() return FLOW_DISSECT_RET_PROTO_AGAIN as long as the Bottom Of Stack bit hasn't been seen, up to a maximum of FLOW_DIS_MPLS_MAX entries. FLOW_DIS_MPLS_MAX is arbitrarily set to 7. This should be enough for many practical purposes, without wasting too much space. To record the parsed values, flow_dissector_key_mpls is modified to store an array of stack entries, instead of just the values of the first one. A bit field, "used_lses", is also added to keep track of the LSEs that have been set. The objective is to avoid defining a new FLOW_DISSECTOR_KEY_MPLS_XX for each level of the MPLS stack. TC flower is adapted for the new struct flow_dissector_key_mpls layout. Matching on several MPLS Label Stack Entries will be added in the next patch. The NFP and MLX5 drivers are also adapted: nfp_flower_compile_mac() and mlx5's parse_tunnel() now verify that the rule only uses the first LSE and fail if it doesn't. Finally, the behaviour of the FLOW_DISSECTOR_KEY_MPLS_ENTROPY key is slightly modified. Instead of recording the first Entropy Label, it now records the last one. This shouldn't have any consequences since there doesn't seem to be any user of FLOW_DISSECTOR_KEY_MPLS_ENTROPY in the tree. We'd probably be better off hashing all parsed MPLS labels instead (excluding reserved labels) anyway. That'd give better entropy and would probably also simplify the code. But that's not the purpose of this patch, so I'm keeping that as a possible future improvement. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller
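The parsing loop itself is easy to model outside the kernel: each 32-bit LSE is split into label/TC/BOS/TTL using the standard RFC 3032 field layout, entries are recorded up to FLOW_DIS_MPLS_MAX, and the walk stops once the Bottom Of Stack bit is seen. A stand-alone sketch under those assumptions; the struct and helper names are illustrative, not the kernel's, and byte-order handling is omitted:

#include <stdint.h>
#include <stdio.h>

#define FLOW_DIS_MPLS_MAX 7

/* Standard MPLS LSE layout (RFC 3032) */
#define MPLS_LS_LABEL_SHIFT 12
#define MPLS_LS_TC_SHIFT    9
#define MPLS_LS_S_SHIFT     8
#define MPLS_LS_TTL_SHIFT   0

struct mpls_lse {
	uint32_t label;
	uint8_t tc, bos, ttl;
};

/* Walk the label stack like the reworked __skb_flow_dissect_mpls():
 * record up to FLOW_DIS_MPLS_MAX entries and stop once BOS is seen.
 * Returns the number of LSEs recorded and sets one used_lses bit per
 * entry, mirroring dissector_set_mpls_lse().
 */
static int dissect_mpls(const uint32_t *stack, int n,
			struct mpls_lse *ls, uint8_t *used_lses)
{
	int i;

	*used_lses = 0;
	for (i = 0; i < n && i < FLOW_DIS_MPLS_MAX; i++) {
		uint32_t entry = stack[i];	/* assume host byte order here */

		ls[i].label = entry >> MPLS_LS_LABEL_SHIFT;
		ls[i].tc = (entry >> MPLS_LS_TC_SHIFT) & 0x7;
		ls[i].bos = (entry >> MPLS_LS_S_SHIFT) & 0x1;
		ls[i].ttl = (entry >> MPLS_LS_TTL_SHIFT) & 0xff;
		*used_lses |= 1 << i;
		if (ls[i].bos)	/* bottom of stack: stop, like RET_OUT_GOOD */
			return i + 1;
	}
	return i;	/* no BOS yet: the kernel would keep dissecting */
}

int main(void)
{
	/* label 100 (no BOS) with TTL 64, then label 200 with BOS set */
	uint32_t stack[] = { (100u << 12) | 64, (200u << 12) | (1u << 8) | 64 };
	struct mpls_lse ls[FLOW_DIS_MPLS_MAX];
	uint8_t used;
	int n = dissect_mpls(stack, 2, ls, &used);

	printf("parsed %d LSEs, used_lses=0x%x, top label=%u, bos[1]=%u\n",
	       n, used, ls[0].label, ls[1].bos);
	return 0;
}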
--- .../mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 27 +++++++---- drivers/net/ethernet/netronome/nfp/flower/match.c | 42 ++++++++++++----- include/net/flow_dissector.h | 14 +++++- net/core/flow_dissector.c | 49 +++++++++++++------- net/sched/cls_flower.c | 52 +++++++++++++++------- 5 files changed, 132 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c index 98ee62e427d2..b4a3c96d34fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -101,25 +101,36 @@ static int parse_tunnel(struct mlx5e_priv *priv, flow_rule_match_mpls(rule, &match); + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) + return -EOPNOTSUPP; + MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_label, match.mask->mpls_label); + outer_first_mpls_over_udp.mpls_label, + match.mask->ls[0].mpls_label); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_label, match.key->mpls_label); + outer_first_mpls_over_udp.mpls_label, + match.key->ls[0].mpls_label); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_exp, match.mask->mpls_tc); + outer_first_mpls_over_udp.mpls_exp, + match.mask->ls[0].mpls_tc); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_exp, match.key->mpls_tc); + outer_first_mpls_over_udp.mpls_exp, match.key->ls[0].mpls_tc); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_s_bos, match.mask->mpls_bos); + outer_first_mpls_over_udp.mpls_s_bos, + match.mask->ls[0].mpls_bos); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_s_bos, match.key->mpls_bos); + outer_first_mpls_over_udp.mpls_s_bos, + match.key->ls[0].mpls_bos); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_ttl, match.mask->mpls_ttl); + outer_first_mpls_over_udp.mpls_ttl, + match.mask->ls[0].mpls_ttl); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_ttl, match.key->mpls_ttl); + outer_first_mpls_over_udp.mpls_ttl, + match.key->ls[0].mpls_ttl); spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; return 0; diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index 546bc01d507d..f7f01e2e3dce 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -74,9 +74,10 @@ nfp_flower_compile_port(struct nfp_flower_in_port *frame, u32 cmsg_port, return 0; } -static void +static int nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, - struct nfp_flower_mac_mpls *msk, struct flow_rule *rule) + struct nfp_flower_mac_mpls *msk, struct flow_rule *rule, + struct netlink_ext_ack *extack) { memset(ext, 0, sizeof(struct nfp_flower_mac_mpls)); memset(msk, 0, sizeof(struct nfp_flower_mac_mpls)); @@ -97,14 +98,28 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, u32 t_mpls; flow_rule_match_mpls(rule, &match); + + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) { + NL_SET_ERR_MSG_MOD(extack, + "unsupported offload: invalid LSE depth for MPLS match offload"); + return -EOPNOTSUPP; + } + +
t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, + match.key->ls[0].mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, + match.key->ls[0].mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, + match.key->ls[0].mpls_bos) | NFP_FLOWER_MASK_MPLS_Q; ext->mpls_lse = cpu_to_be32(t_mpls); - t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, match.mask->mpls_label) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, match.mask->mpls_tc) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, match.mask->mpls_bos) | + t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, + match.mask->ls[0].mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, + match.mask->ls[0].mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, + match.mask->ls[0].mpls_bos) | NFP_FLOWER_MASK_MPLS_Q; msk->mpls_lse = cpu_to_be32(t_mpls); } else if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { @@ -121,6 +136,8 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, msk->mpls_lse = cpu_to_be32(NFP_FLOWER_MASK_MPLS_Q); } } + + return 0; } static void @@ -461,9 +478,12 @@ int nfp_flower_compile_flow_match(struct nfp_app *app, msk += sizeof(struct nfp_flower_in_port); if (NFP_FLOWER_LAYER_MAC & key_ls->key_layer) { - nfp_flower_compile_mac((struct nfp_flower_mac_mpls *)ext, - (struct nfp_flower_mac_mpls *)msk, - rule); + err = nfp_flower_compile_mac((struct nfp_flower_mac_mpls *)ext, + (struct nfp_flower_mac_mpls *)msk, + rule, extack); + if (err) + return err; + ext += sizeof(struct nfp_flower_mac_mpls); msk += sizeof(struct nfp_flower_mac_mpls); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 628383915827..4fb1a69c6ecf 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -59,13 +59,25 @@ struct flow_dissector_key_vlan { __be16 vlan_tpid; }; -struct flow_dissector_key_mpls { +struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; +#define FLOW_DIS_MPLS_MAX 7 +struct flow_dissector_key_mpls { + struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ + u8 used_lses; /* One bit set for each Label Stack Entry in use */ +}; + +static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, + int lse_index) +{ + mpls->used_lses |= 1 << lse_index; +} + #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5dceed467f64..0aeb33572feb 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -480,47 +480,59 @@ EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, void *data, int nhoff, int hlen, + int lse_index, bool *entropy_label) { - struct flow_dissector_key_keyid *key_keyid; - struct mpls_label *hdr, _hdr[2]; - u32 entry, label; + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; - entry = ntohl(hdr[0].entry); + entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, 
FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); - key_mpls->mpls_label = label; - key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) - >> MPLS_LS_TTL_SHIFT; - key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) - >> MPLS_LS_TC_SHIFT; - key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) - >> MPLS_LS_S_SHIFT; + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); } - if (label == MPLS_LABEL_ENTROPY) { + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); - key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + key_keyid->keyid = cpu_to_be32(label); } - return FLOW_DISSECT_RET_OUT_GOOD; + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret @@ -979,6 +991,8 @@ bool __skb_flow_dissect(const struct net *net, struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -1278,7 +1292,10 @@ proto_again: case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, - nhoff, hlen); + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0c574700da75..f524afe0b7f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -781,9 +781,17 @@ static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_mask, struct netlink_ext_ack *extack) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + + lse_val = &key_val->ls[0]; + lse_mask = &key_mask->ls[0]; + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { - key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); - key_mask->mpls_ttl = MPLS_TTL_MASK; + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); @@ -794,8 +802,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; } - key_val->mpls_bos = bos; - key_mask->mpls_bos = MPLS_BOS_MASK; + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); @@ -806,8 +816,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; } - key_val->mpls_tc = tc; - key_mask->mpls_tc = MPLS_TC_MASK; + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { 
u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); @@ -818,8 +830,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Label must be between 0 and 1048575"); return -EINVAL; } - key_val->mpls_label = label; - key_mask->mpls_label = MPLS_LABEL_MASK; + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } return 0; } @@ -2222,31 +2236,37 @@ static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_key; int err; if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) return 0; - if (mpls_mask->mpls_ttl) { + + lse_mask = &mpls_mask->ls[0]; + lse_key = &mpls_key->ls[0]; + + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, - mpls_key->mpls_ttl); + lse_key->mpls_ttl); if (err) return err; } - if (mpls_mask->mpls_tc) { + if (lse_mask->mpls_tc) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, - mpls_key->mpls_tc); + lse_key->mpls_tc); if (err) return err; } - if (mpls_mask->mpls_label) { + if (lse_mask->mpls_label) { err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, - mpls_key->mpls_label); + lse_key->mpls_label); if (err) return err; } - if (mpls_mask->mpls_bos) { + if (lse_mask->mpls_bos) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, - mpls_key->mpls_bos); + lse_key->mpls_bos); if (err) return err; } -- cgit v1.2.3 From 61aec25a6db5d0c2e8ab5da6d2d152269d0d9d69 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 May 2020 14:29:04 +0200 Subject: cls_flower: Support filtering on multiple MPLS Label Stack Entries With struct flow_dissector_key_mpls now recording the first FLOW_DIS_MPLS_MAX labels, we can extend Flower to filter on any of these LSEs independently. In order to avoid creating new netlink attributes for every possible depth, let's define a new TCA_FLOWER_KEY_MPLS_OPTS nested attribute that contains the list of LSEs to match. Each LSE is represented by another attribute, TCA_FLOWER_KEY_MPLS_OPTS_LSE, which then contains the attributes representing the depth and the MPLS fields to match at this depth (label, TTL, etc.). For each MPLS field, the mask is always set to all-ones, as this is what the original API did. We could allow user configurable masks in the future if there is demand for more flexibility. The new API also allows to only specify an LSE depth. In that case, Flower only verifies that the MPLS label stack depth is greater or equal to the provided depth (that is, an LSE exists at this depth). Filters that only match on one (or more) fields of the first LSE are dumped using the old netlink attributes, to avoid confusing user space programs that don't understand the new API. Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 23 ++++ net/sched/cls_flower.c | 243 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 265 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index fc672b232437..7576209d96f9 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -576,6 +576,8 @@ enum { TCA_FLOWER_KEY_CT_LABELS, /* u128 */ TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */ + TCA_FLOWER_KEY_MPLS_OPTS, + __TCA_FLOWER_MAX, }; @@ -640,6 +642,27 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) +enum { + TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPTS_LSE, + __TCA_FLOWER_KEY_MPLS_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPTS_MAX (__TCA_FLOWER_KEY_MPLS_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_MPLS_OPT_LSE_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + __TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX \ + (__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f524afe0b7f5..96f5999281e0 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -668,6 +668,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, @@ -726,6 +727,20 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static const struct nla_policy +mpls_opts_policy[TCA_FLOWER_KEY_MPLS_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPTS_LSE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -776,6 +791,126 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_mpls_lse(const struct nlattr *nla_lse, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1]; + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + u8 lse_index; + u8 depth; + int err; + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse, + mpls_stack_entry_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) { + NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\""); + return -EINVAL; + } + + depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]); + + /* LSE depth starts at 1, for consistency with 
terminology used by + * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets. + */ + if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH], + "Invalid MPLS depth"); + return -EINVAL; + } + lse_index = depth - 1; + + dissector_set_mpls_lse(key_val, lse_index); + dissector_set_mpls_lse(key_mask, lse_index); + + lse_val = &key_val->ls[lse_index]; + lse_mask = &key_mask->ls[lse_index]; + + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) { + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) { + u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]); + + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); + return -EINVAL; + } + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) { + u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]); + + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC], + "Traffic Class (TC) must be between 0 and 7"); + return -EINVAL; + } + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) { + u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]); + + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL], + "Label must be between 0 and 1048575"); + return -EINVAL; + } + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + } + + return 0; +} + +static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_lse; + int rem; + int err; + + if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts, + "NLA_F_NESTED is missing"); + return -EINVAL; + } + + nla_for_each_nested(nla_lse, nla_mpls_opts, rem) { + if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) { + NL_SET_ERR_MSG_ATTR(extack, nla_lse, + "Invalid MPLS option type"); + return -EINVAL; + } + + err = fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack); + if (err < 0) + return err; + } + if (rem) { + NL_SET_ERR_MSG(extack, + "Bytes leftover after parsing MPLS options"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask, @@ -784,6 +919,21 @@ static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_mpls_lse *lse_mask; struct flow_dissector_mpls_lse *lse_val; + if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) { + if (tb[TCA_FLOWER_KEY_MPLS_TTL] || + tb[TCA_FLOWER_KEY_MPLS_BOS] || + tb[TCA_FLOWER_KEY_MPLS_TC] || + tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPTS], + "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute"); + return -EBADMSG; + } + + return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS], + key_val, key_mask, extack); + } + lse_val = &key_val->ls[0]; lse_mask = &key_mask->ls[0]; @@ -2232,6 +2382,89 @@ static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, return 0; } +static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct 
flow_dissector_key_mpls *mpls_mask, + u8 lse_index) +{ + struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index]; + struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index]; + int err; + + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + lse_index + 1); + if (err) + return err; + + if (lse_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + lse_key->mpls_ttl); + if (err) + return err; + } + if (lse_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + lse_key->mpls_bos); + if (err) + return err; + } + if (lse_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + lse_key->mpls_tc); + if (err) + return err; + } + if (lse_mask->mpls_label) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); + if (err) + return err; + } + + return 0; +} + +static int fl_dump_key_mpls_opts(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + struct nlattr *opts; + struct nlattr *lse; + u8 lse_index; + int err; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS); + if (!opts) + return -EMSGSIZE; + + for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) { + if (!(mpls_mask->used_lses & 1 << lse_index)) + continue; + + lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE); + if (!lse) { + err = -EMSGSIZE; + goto err_opts; + } + + err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask, + lse_index); + if (err) + goto err_opts_lse; + nla_nest_end(skb, lse); + } + nla_nest_end(skb, opts); + + return 0; + +err_opts_lse: + nla_nest_cancel(skb, lse); +err_opts: + nla_nest_cancel(skb, opts); + + return err; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) @@ -2240,12 +2473,20 @@ static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_mpls_lse *lse_key; int err; - if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + if (!mpls_mask->used_lses) return 0; lse_mask = &mpls_mask->ls[0]; lse_key = &mpls_key->ls[0]; + /* For backward compatibility, don't use the MPLS nested attributes if + * the rule can be expressed using the old attributes. + */ + if (mpls_mask->used_lses & ~1 || + (!lse_mask->mpls_ttl && !lse_mask->mpls_bos && + !lse_mask->mpls_tc && !lse_mask->mpls_label)) + return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask); + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, lse_key->mpls_ttl); -- cgit v1.2.3 From 09d0310f07672b7e3e60cf719b96c803f4830e5c Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 25 May 2020 17:31:58 +0200 Subject: net/smc: mark smc_pnet_policy as const Netlink policies are generally declared as const. This is safer and prevents potential bugs. Signed-off-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index be03f1260d59..014d91b9778e 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -32,7 +32,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev); -static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { +static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN -- cgit v1.2.3 From 4e637c70b503b686aae45716a25a94dc3a434f3a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 May 2020 23:41:13 +0200 Subject: mptcp: attempt coalescing when moving skbs to mptcp rx queue We can try to coalesce skbs we take from the subflow's rx queue with the tail of the mptcp rx queue. If successful, the skb head can be discarded early. We can also free the skb extensions, as we do not access them after this. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ba9d3d5c625f..b2c8b57e7942 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -144,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, unsigned int offset, size_t copy_len) { struct sock *sk = (struct sock *)msk; + struct sk_buff *tail; __skb_unlink(skb, &ssk->sk_receive_queue); - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_ext_reset(skb); + skb_orphan(skb); msk->ack_seq += copy_len; + + tail = skb_peek_tail(&sk->sk_receive_queue); + if (offset == 0 && tail) { + bool fragstolen; + int delta; + + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + kfree_skb_partial(skb, fragstolen); + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + return; + } + } + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); MPTCP_SKB_CB(skb)->offset = offset; } -- cgit v1.2.3 From 1a644de29f712771c2ec00e52caa391544eb6141 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:38 +0200 Subject: net: ethtool: Add generic parts of cable test TDR Add the generic parts of the code used to trigger a cable test and return raw TDR data. Any PHY driver which supports this must implement the new driver op. Signed-off-by: Andrew Lunn [v2: update nxp-tja11xx for API change] Signed-off-by: David S. Miller
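The generic/driver split follows the kernel's usual optional-op pattern: phy_start_cable_test_tdr() first checks that the PHY driver provides both cable_test_tdr_start and cable_test_get_status, and bails out with -EOPNOTSUPP otherwise (see the phy.c hunk below). A stand-alone model of just that capability check; the types here are hypothetical miniatures, and only the errno value is Linux's:

#include <stdio.h>

#define EOPNOTSUPP 95	/* Linux errno for "operation not supported" */

/* Hypothetical miniature of the phy_driver ops table: either op
 * may be left NULL by drivers that do not support raw TDR data.
 */
struct phy_ops {
	int (*cable_test_tdr_start)(void *phy);
	int (*cable_test_get_status)(void *phy);
};

/* Models the guard at the top of phy_start_cable_test_tdr(): both
 * ops must be present before a TDR test may be triggered.
 */
static int start_cable_test_tdr(const struct phy_ops *ops, void *phy)
{
	if (!ops->cable_test_tdr_start || !ops->cable_test_get_status)
		return -EOPNOTSUPP;
	return ops->cable_test_tdr_start(phy);
}

static int dummy_tdr_start(void *phy) { (void)phy; return 0; }
static int dummy_get_status(void *phy) { (void)phy; return 0; }

int main(void)
{
	struct phy_ops with_tdr = { dummy_tdr_start, dummy_get_status };
	struct phy_ops without_tdr = { 0 };

	printf("with op:    %d\n", start_cable_test_tdr(&with_tdr, NULL));
	printf("without op: %d\n", start_cable_test_tdr(&without_tdr, NULL));
	return 0;
}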
--- drivers/net/phy/nxp-tja11xx.c | 2 +- drivers/net/phy/phy.c | 65 ++++++++++++++++++++++++++++++++++++++++- include/linux/ethtool_netlink.h | 4 +-- include/linux/phy.h | 13 +++++++++ net/ethtool/cabletest.c | 64 ++++++++++++++++++++++++++++++++++++---- net/ethtool/netlink.c | 5 ++++ net/ethtool/netlink.h | 1 + 7 files changed, 144 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 1e79c30ca81a..a72fa0d2e7c7 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -194,7 +194,7 @@ static int tja11xx_config_aneg_cable_test(struct phy_device *phydev) !phydev->drv->cable_test_get_status) return 0; - ret = ethnl_cable_test_alloc(phydev); + ret = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (ret) return ret; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 27da0c94818f..495d9ba3d5bf 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -519,7 +519,7 @@ int phy_start_cable_test(struct phy_device *phydev, goto out; } - err = ethnl_cable_test_alloc(phydev); + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); if (err) goto out; @@ -552,6 +552,69 @@ out: } EXPORT_SYMBOL(phy_start_cable_test); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = phydev->attached_dev; + int err = -ENOMEM; + + if (!(phydev->drv && + phydev->drv->cable_test_tdr_start && + phydev->drv->cable_test_get_status)) { + NL_SET_ERR_MSG(extack, + "PHY driver does not support cable test TDR"); + return -EOPNOTSUPP; + } + + mutex_lock(&phydev->lock); + if (phydev->state == PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY already performing a test"); + err = -EBUSY; + goto out; + } + + if (phydev->state < PHY_UP || + phydev->state > PHY_CABLETEST) { + NL_SET_ERR_MSG(extack, + "PHY not configured.
Try setting interface up"); + err = -EBUSY; + goto out; + } + + err = ethnl_cable_test_alloc(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + if (err) + goto out; + + /* Mark the carrier down until the test is complete */ + phy_link_down(phydev); + + netif_testing_on(dev); + err = phydev->drv->cable_test_tdr_start(phydev); + if (err) { + netif_testing_off(dev); + phy_link_up(phydev); + goto out_free; + } + + phydev->state = PHY_CABLETEST; + + if (phy_polling_mode(phydev)) + phy_trigger_machine(phydev); + + mutex_unlock(&phydev->lock); + + return 0; + +out_free: + ethnl_cable_test_free(phydev); +out: + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_start_cable_test_tdr); + static int phy_config_aneg(struct phy_device *phydev) { if (phydev->drv->config_aneg) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index e317fc99565e..24817ba252a0 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -17,13 +17,13 @@ enum ethtool_multicast_groups { struct phy_device; #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) -int ethnl_cable_test_alloc(struct phy_device *phydev); +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); #else -static inline int ethnl_cable_test_alloc(struct phy_device *phydev) +static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { return -EOPNOTSUPP; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 6d256e720a66..d3c384f353ca 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -699,6 +699,10 @@ struct phy_driver { /* Start a cable test */ int (*cable_test_start)(struct phy_device *dev); + + /* Start a raw TDR cable test */ + int (*cable_test_tdr_start)(struct phy_device *dev); + /* Once per second, or on interrupt, request the status of the * test. 
*/ @@ -1251,6 +1255,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1259,6 +1265,13 @@ int phy_start_cable_test(struct phy_device *phydev, NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } +static inline +int phy_start_cable_test_tdr(struct phy_device *phydev, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); + return -EOPNOTSUPP; +} #endif int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 5ba06eabe8c2..94e9d5f04353 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,7 +13,7 @@ cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, }; -static int ethnl_cable_test_started(struct phy_device *phydev) +static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) { struct sk_buff *skb; int err = -ENOMEM; @@ -23,7 +23,7 @@ static int ethnl_cable_test_started(struct phy_device *phydev) if (!skb) goto out; - ehdr = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_CABLE_TEST_NTF); + ehdr = ethnl_bcastmsg_put(skb, cmd); if (!ehdr) { err = -EMSGSIZE; goto out; @@ -86,7 +86,8 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) ethnl_ops_complete(dev); if (!ret) - ethnl_cable_test_started(dev->phydev); + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_NTF); out_rtnl: rtnl_unlock(); @@ -95,7 +96,7 @@ out_dev_put: return ret; } -int ethnl_cable_test_alloc(struct phy_device *phydev) +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; @@ -103,8 +104,7 @@ int ethnl_cable_test_alloc(struct phy_device *phydev) if (!phydev->skb) goto out; - phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, - ETHTOOL_MSG_CABLE_TEST_NTF); + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd); if (!phydev->ehdr) { err = -EMSGSIZE; goto out; @@ -199,3 +199,55 @@ err: return ret; } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); + +static const struct nla_policy +cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_TDR_MAX, + cable_test_tdr_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + + ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git 
a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0f2f4754dcf9..88fd07f47040 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -844,6 +844,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test_tdr, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index b0eb5d920099..9a96b6e90dc2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -360,5 +360,6 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 6b4a0fc106521e480c00b55a7ef38c89f02dc4e8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:39 +0200 Subject: net: ethtool: Add helpers for cable test TDR data Add helpers for returning raw TDR helpers in netlink messages. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 21 +++++++++++ net/ethtool/cabletest.c | 80 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 24817ba252a0..8fbe4f97ffad 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -22,6 +22,10 @@ void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); +int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV); +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV); +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { @@ -46,5 +50,22 @@ static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, { return -EOPNOTSUPP; } + +static inline int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + return -EOPNOTSUPP; +} + +static inline int ethnl_cable_test_step(struct phy_device *phydev, u32 first, + u32 last, u32 step) +{ + return -EOPNOTSUPP; +} #endif /* IS_ENABLED(ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 94e9d5f04353..390d0673ff01 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -100,7 +100,10 @@ int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { int err = -ENOMEM; - phydev->skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + /* One TDR sample occupies 20 bytes. For a 150 meter cable, + * with four pairs, around 12K is needed. 
+	 */
+	phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL);
 	if (!phydev->skb)
 		goto out;
@@ -251,3 +254,78 @@ out_dev_put:
 	dev_put(dev);
 	return ret;
 }
+
+int ethnl_cable_test_amplitude(struct phy_device *phydev,
+			       u8 pair, s16 mV)
+{
+	struct nlattr *nest;
+	int ret = -EMSGSIZE;
+
+	nest = nla_nest_start(phydev->skb,
+			      ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair))
+		goto err;
+	if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV))
+		goto err;
+
+	nla_nest_end(phydev->skb, nest);
+	return 0;
+
+err:
+	nla_nest_cancel(phydev->skb, nest);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude);
+
+int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV)
+{
+	struct nlattr *nest;
+	int ret = -EMSGSIZE;
+
+	nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV))
+		goto err;
+
+	nla_nest_end(phydev->skb, nest);
+	return 0;
+
+err:
+	nla_nest_cancel(phydev->skb, nest);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse);
+
+int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last,
+			  u32 step)
+{
+	struct nlattr *nest;
+	int ret = -EMSGSIZE;
+
+	nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE,
+			first))
+		goto err;
+
+	if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last))
+		goto err;
+
+	if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step))
+		goto err;
+
+	nla_nest_end(phydev->skb, nest);
+	return 0;
+
+err:
+	nla_nest_cancel(phydev->skb, nest);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_step);
--
cgit v1.2.3

From f2bc8ad31a7f814237bc6301d59296d76505a688 Mon Sep 17 00:00:00 2001
From: Andrew Lunn
Date: Wed, 27 May 2020 00:21:41 +0200
Subject: net: ethtool: Allow PHY cable test TDR data to be configured

Allow the user to configure where on the cable the TDR data should be
retrieved, in terms of first and last sample, and the step between
samples. Also add the ability to ask for TDR data for just one pair.

If this configuration is not provided, it defaults to 1-150m at 1m
intervals for all pairs.

Signed-off-by: Andrew Lunn

v3:
 - Move the TDR configuration into a structure
 - Add a range check on step
 - Use NL_SET_ERR_MSG_ATTR() when appropriate
 - Move TDR configuration into a nest
 - Document attributes in the request

Signed-off-by: David S. Miller
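[Editor's note: for orientation, here is a minimal userspace sketch, not
part of the patch, of how such a TDR request could be built with libmnl.
The ethtool genetlink family id is assumed to have been resolved
beforehand (family_id is a placeholder), error handling is omitted, and
all distances are in centimeters, matching the uapi attributes this
patch adds.]

	#include <stdint.h>
	#include <linux/ethtool_netlink.h>
	#include <linux/genetlink.h>
	#include <libmnl/libmnl.h>

	/* Build an ETHTOOL_MSG_CABLE_TEST_TDR_ACT request asking for TDR
	 * data between 1m and 20m, sampled every 1m, on pair A only.
	 */
	static struct nlmsghdr *build_tdr_request(void *buf, uint16_t family_id,
						  const char *ifname)
	{
		struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
		struct genlmsghdr *genl;
		struct nlattr *header, *cfg;

		nlh->nlmsg_type = family_id;	/* resolved "ethtool" family */
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

		genl = mnl_nlmsg_put_extra_header(nlh, sizeof(*genl));
		genl->cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT;
		genl->version = ETHTOOL_GENL_VERSION;

		/* Select the device to test */
		header = mnl_attr_nest_start(nlh, ETHTOOL_A_CABLE_TEST_TDR_HEADER);
		mnl_attr_put_strz(nlh, ETHTOOL_A_HEADER_DEV_NAME, ifname);
		mnl_attr_nest_end(nlh, header);

		/* Optional configuration nest; distances in centimeters */
		cfg = mnl_attr_nest_start(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG);
		mnl_attr_put_u32(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, 100);
		mnl_attr_put_u32(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, 2000);
		mnl_attr_put_u32(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, 100);
		mnl_attr_put_u8(nlh, ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR,
				ETHTOOL_A_CABLE_PAIR_A);
		mnl_attr_nest_end(nlh, cfg);

		return nlh;
	}

Omitting the ETHTOOL_A_CABLE_TEST_TDR_CFG nest, or any of its members,
leaves the corresponding parameter at the kernel defaults described in
the commit message.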
---
 Documentation/networking/ethtool-netlink.rst | 22 +++++-
 drivers/net/phy/marvell.c | 59 ++++++++++-----
 drivers/net/phy/phy.c | 5 +-
 include/linux/phy.h | 21 +++++-
 include/uapi/linux/ethtool_netlink.h | 13 ++++
 net/ethtool/cabletest.c | 104 ++++++++++++++++++++++++++-
 6 files changed, 197 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index dae36227d590..d42661b91128 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1023,9 +1023,25 @@ Start a cable test and report raw TDR data

 Request contents:

- ==================================== ====== ==========================
- ``ETHTOOL_A_CABLE_TEST_TDR_HEADER``  nested  request header
- ==================================== ====== ==========================
+ +--------------------------------------------+--------+-----------------------+
+ | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER``        | nested | reply header          |
+ +--------------------------------------------+--------+-----------------------+
+ | ``ETHTOOL_A_CABLE_TEST_TDR_CFG``           | nested | test configuration    |
+ +-+------------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE``  | u32    | first data distance   |
+ +-+-+----------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE``   | u32    | last data distance    |
+ +-+-+----------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE``   | u32    | distance of each step |
+ +-+-+----------------------------------------+--------+-----------------------+
+ | | ``ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR``    | u8     | pair to test          |
+ +-+-+----------------------------------------+--------+-----------------------+
+
+The ETHTOOL_A_CABLE_TEST_TDR_CFG nest is optional, as is each of its
+members. All distances are expressed in centimeters. The PHY takes
+the distances as a guide, and rounds to the nearest distance it
+actually supports. If a pair is passed, only that one pair will be
+tested. Otherwise all pairs are tested.
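[Editor's note: the driver side of this contract is small. A
cable_test_tdr_start() implementation receives the struct phy_tdr_config
added by this patch, adjusts the requested window to what the hardware
can do, and reports the pulse and step it actually used via the ethnl_*
helpers from the previous patch. A hedged sketch for a hypothetical
"foo" PHY driver follows; the foo_* name and the 1m granularity are
assumptions, distances are in centimeters.]

	#include <linux/kernel.h>
	#include <linux/ethtool_netlink.h>
	#include <linux/phy.h>

	static int foo_cable_test_tdr_start(struct phy_device *phydev,
					    const struct phy_tdr_config *cfg)
	{
		/* Round to whole meters within the 1m..150m this
		 * hypothetical hardware supports.
		 */
		u32 first = clamp_t(u32, rounddown(cfg->first, 100), 100, 15000);
		u32 last = clamp_t(u32, roundup(cfg->last, 100), first, 15000);
		u32 step = max_t(u32, rounddown(cfg->step, 100), 100);
		int err;

		/* cfg->pair is either PHY_PAIR_ALL or one pair index;
		 * pair selection is applied when sampling, not here.
		 */

		/* Report the amplitude of the transmitted pulse in mV */
		err = ethnl_cable_test_pulse(phydev, 1000);
		if (err)
			return err;

		/* Report the distance range and step size actually used */
		return ethnl_cable_test_step(phydev, first, last, step);
	}

The per-distance amplitudes themselves are then reported from the
driver's cable_test_get_status() callback via
ethnl_cable_test_amplitude(), as the Marvell driver below does.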
Notification contents: diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e597bee2e966..335e51d6f138 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -198,6 +198,7 @@ #define MII_VCT5_CTRL_PEEK_HYST_DEFAULT 3 #define MII_VCT5_SAMPLE_POINT_DISTANCE 0x18 +#define MII_VCT5_SAMPLE_POINT_DISTANCE_MAX 511 #define MII_VCT5_TX_PULSE_CTRL 0x1c #define MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN BIT(12) #define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS (0x0 << 10) @@ -270,6 +271,10 @@ struct marvell_priv { char *hwmon_name; struct device *hwmon_dev; bool cable_test_tdr; + u32 first; + u32 last; + u32 step; + s8 pair; }; static int marvell_read_page(struct phy_device *phydev) @@ -1787,12 +1792,18 @@ static u32 marvell_vct5_distance2cm(int distance) return distance * 805 / 10; } +static u32 marvell_vct5_cm2distance(int cm) +{ + return cm * 10 / 805; +} + static int marvell_vct5_amplitude_distance(struct phy_device *phydev, - int distance) + int distance, int pair) { - int mV_pair0, mV_pair1, mV_pair2, mV_pair3; u16 reg; int err; + int mV; + int i; err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, MII_VCT5_SAMPLE_POINT_DISTANCE, @@ -1814,21 +1825,20 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, if (err) return err; - mV_pair0 = marvell_vct5_amplitude(phydev, 0); - mV_pair1 = marvell_vct5_amplitude(phydev, 1); - mV_pair2 = marvell_vct5_amplitude(phydev, 2); - mV_pair3 = marvell_vct5_amplitude(phydev, 3); + for (i = 0; i < 4; i++) { + if (pair != PHY_PAIR_ALL && i != pair) + continue; - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_A, mV_pair0); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_B, mV_pair1); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_C, mV_pair2); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_D, mV_pair3); + mV = marvell_vct5_amplitude(phydev, i); + ethnl_cable_test_amplitude(phydev, i, mV); + } return 0; } static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { + struct marvell_priv *priv = phydev->priv; int distance; int err; u16 reg; @@ -1843,8 +1853,11 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) if (err) return err; - for (distance = 0; distance <= 100; distance++) { - err = marvell_vct5_amplitude_distance(phydev, distance); + for (distance = priv->first; + distance <= priv->last; + distance += priv->step) { + err = marvell_vct5_amplitude_distance(phydev, distance, + priv->pair); if (err) return err; } @@ -1918,11 +1931,24 @@ static int marvell_vct7_cable_test_start(struct phy_device *phydev) MII_VCT7_CTRL_CENTIMETERS); } -static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) +static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev, + const struct phy_tdr_config *cfg) { struct marvell_priv *priv = phydev->priv; int ret; + priv->cable_test_tdr = true; + priv->first = marvell_vct5_cm2distance(cfg->first); + priv->last = marvell_vct5_cm2distance(cfg->last); + priv->step = marvell_vct5_cm2distance(cfg->step); + priv->pair = cfg->pair; + + if (priv->first > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + + if (priv->last > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + /* Disable VCT7 */ ret = phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, MII_VCT7_CTRL, 0); @@ -1933,15 +1959,14 @@ static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) if (ret) return ret; - priv->cable_test_tdr = true; ret = ethnl_cable_test_pulse(phydev, 1000); if (ret) return ret; return 
ethnl_cable_test_step(phydev, - marvell_vct5_distance2cm(0), - marvell_vct5_distance2cm(100), - marvell_vct5_distance2cm(1)); + marvell_vct5_distance2cm(priv->first), + marvell_vct5_distance2cm(priv->last), + marvell_vct5_distance2cm(priv->step)); } static int marvell_vct7_distance_to_length(int distance, bool meter) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 495d9ba3d5bf..1de3938628f4 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -553,7 +553,8 @@ out: EXPORT_SYMBOL(phy_start_cable_test); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { struct net_device *dev = phydev->attached_dev; int err = -ENOMEM; @@ -590,7 +591,7 @@ int phy_start_cable_test_tdr(struct phy_device *phydev, phy_link_down(phydev); netif_testing_on(dev); - err = phydev->drv->cable_test_tdr_start(phydev); + err = phydev->drv->cable_test_tdr_start(phydev, config); if (err) { netif_testing_off(dev); phy_link_up(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index d3c384f353ca..8c05d0fb5c00 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -548,6 +548,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) +/* A structure containing possible configuration parameters + * for a TDR cable test. The driver does not need to implement + * all the parameters, but should report what is actually used. + */ +struct phy_tdr_config { + u32 first; + u32 last; + u32 step; + s8 pair; +}; +#define PHY_PAIR_ALL -1 + /* struct phy_driver: Driver structure for a particular PHY type * * driver_data: static driver data @@ -701,7 +713,8 @@ struct phy_driver { int (*cable_test_start)(struct phy_device *dev); /* Start a raw TDR cable test */ - int (*cable_test_tdr_start)(struct phy_device *dev); + int (*cable_test_tdr_start)(struct phy_device *dev, + const struct phy_tdr_config *config); /* Once per second, or on interrupt, request the status of the * test. 
@@ -1256,7 +1269,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1267,7 +1281,8 @@ int phy_start_cable_test(struct phy_device *phydev, } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 739faa7070c6..fc9051f2eeac 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -482,9 +482,22 @@ enum { /* CABLE TEST TDR */ +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CNT, diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 390d0673ff01..9991688d7d1d 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -5,7 +5,11 @@ #include "netlink.h" #include "common.h" -/* CABLE_TEST_ACT */ +/* 802.3 standard allows 100 meters for BaseT cables. However longer + * cables might work, depending on the quality of the cables and the + * PHY. So allow testing for up to 150 meters. 
+ */ +#define MAX_CABLE_LENGTH_CM (150 * 100) static const struct nla_policy cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { @@ -203,16 +207,107 @@ err: } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); +struct cable_test_tdr_req_info { + struct ethnl_req_info base; +}; + +static const struct nla_policy +cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, +}; + static const struct nla_policy cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; +/* CABLE_TEST_TDR_ACT */ +int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, + cable_test_tdr_act_cfg_policy, info->extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) + cfg->first = nla_get_u32( + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); + else + cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) + cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); + else + cfg->last = MAX_CABLE_LENGTH_CM; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) + cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); + else + cfg->step = 100; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { + cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); + if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { + NL_SET_ERR_MSG_ATTR( + info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], + "invalid pair parameter"); + return -EINVAL; + } + } else { + cfg->pair = PHY_PAIR_ALL; + } + + if (cfg->first > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], + "invalid first parameter"); + return -EINVAL; + } + + if (cfg->last > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], + "invalid last parameter"); + return -EINVAL; + } + + if (cfg->first > cfg->last) { + NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); + return -EINVAL; + } + + if (!cfg->step) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "invalid step parameter"); + return -EINVAL; + } + + if (cfg->step > (cfg->last - cfg->first)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "step parameter too big"); + return -EINVAL; + } + + return 0; +} + int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -235,12 +330,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; } + ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], + info, &cfg); + if (ret) + goto out_dev_put; + rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); ethnl_ops_complete(dev); -- cgit 
v1.2.3

From 60c2ef0ef07f319504763eaaed8cb003af879008 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich
Date: Fri, 24 Apr 2020 14:29:02 +0300
Subject: mac80211: fix variable names in TID config methods

Fix all variable names from 'tid' to 'tids' to avoid confusion: the
value is not a TID number, but a TID mask.

Signed-off-by: Sergey Matyukevich
Link: https://lore.kernel.org/r/20200424112905.26770-3-sergey.matyukevich.os@quantenna.com
Signed-off-by: Johannes Berg
---
 net/mac80211/cfg.c | 6 +++---
 net/mac80211/driver-ops.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 548a384b0509..06a2b7640a9d 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3957,7 +3957,7 @@ static int ieee80211_set_tid_config(struct wiphy *wiphy,

 static int ieee80211_reset_tid_config(struct wiphy *wiphy,
 				      struct net_device *dev,
-				      const u8 *peer, u8 tid)
+				      const u8 *peer, u8 tids)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct sta_info *sta;
@@ -3967,7 +3967,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy,
 		return -EOPNOTSUPP;

 	if (!peer)
-		return drv_reset_tid_config(sdata->local, sdata, NULL, tid);
+		return drv_reset_tid_config(sdata->local, sdata, NULL, tids);

 	mutex_lock(&sdata->local->sta_mtx);
 	sta = sta_info_get_bss(sdata, peer);
@@ -3976,7 +3976,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy,
 		return -ENOENT;
 	}

-	ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tid);
+	ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids);
 	mutex_unlock(&sdata->local->sta_mtx);

 	return ret;

diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 3877710e3b48..de69fc9c4f07 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1375,12 +1375,12 @@ static inline int drv_set_tid_config(struct ieee80211_local *local,

 static inline int drv_reset_tid_config(struct ieee80211_local *local,
 				       struct ieee80211_sub_if_data *sdata,
-				       struct ieee80211_sta *sta, u8 tid)
+				       struct ieee80211_sta *sta, u8 tids)
 {
 	int ret;

 	might_sleep();
-	ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tid);
+	ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids);
 	trace_drv_return_int(local, ret);

 	return ret;
--
cgit v1.2.3

From 33462e68231bccfe563a87614f4c4dd5d333837c Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich
Date: Fri, 24 Apr 2020 14:29:03 +0300
Subject: cfg80211: add support for TID specific AMSDU configuration

This patch adds support to control per-TID MSDU aggregation using the
NL80211_TID_CONFIG_ATTR_AMSDU_CTRL attribute.
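[Editor's note: a rough libnl-3 userspace sketch, not part of the
commit, of turning A-MSDU aggregation off for TIDs 0 and 3 with the new
attribute. It assumes the nl80211 family id and the interface index were
resolved elsewhere, and omits error handling.]

	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <linux/nl80211.h>

	static struct nl_msg *build_amsdu_disable(int nl80211_family, int ifindex)
	{
		struct nl_msg *msg = nlmsg_alloc();
		struct nlattr *tid_config, *entry;

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl80211_family,
			    0, 0, NL80211_CMD_SET_TID_CONFIG, 0);
		nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);

		/* NL80211_ATTR_TID_CONFIG carries an array of config nests */
		tid_config = nla_nest_start(msg, NL80211_ATTR_TID_CONFIG);
		entry = nla_nest_start(msg, 1);
		/* TIDS is a u16 bitmask: bits 0 and 3 select TIDs 0 and 3 */
		nla_put_u16(msg, NL80211_TID_CONFIG_ATTR_TIDS,
			    (1 << 0) | (1 << 3));
		nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_AMSDU_CTRL,
			   NL80211_TID_CONFIG_DISABLE);
		nla_nest_end(msg, entry);
		nla_nest_end(msg, tid_config);

		return msg;
	}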
Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-4-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 10 +++++++--- net/wireless/nl80211.c | 8 ++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e71d4f690ef1..5cacf24cc9f0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -640,8 +640,9 @@ struct cfg80211_chan_def { * @noack: noack configuration value for the TID * @retry_long: retry count value * @retry_short: retry count value - * @ampdu: Enable/Disable aggregation + * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS + * @amsdu: Enable/Disable MSDU aggregation */ struct cfg80211_tid_cfg { bool config_override; @@ -651,6 +652,7 @@ struct cfg80211_tid_cfg { u8 retry_long, retry_short; enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; + enum nl80211_tid_config amsdu; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9679d561f7d0..1ccb0bf657ec 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4844,12 +4844,15 @@ enum nl80211_tid_config { * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and * the max value is advertised by the driver in this attribute on * output in wiphy capabilities. - * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable aggregation for the TIDs - * specified in %NL80211_TID_CONFIG_ATTR_TIDS. Its type is u8, using - * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable MPDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. * @NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL: Enable/Disable RTS_CTS for the TIDs * specified in %NL80211_TID_CONFIG_ATTR_TIDS. It is u8 type, using * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. 
*/ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4863,6 +4866,7 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_RETRY_LONG, NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, + NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fa66d5b6f557..482a80b78844 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -343,6 +343,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -14080,6 +14082,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMSDU_CTRL); + tid_conf->amsdu = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From c03369558c435f7e82f7c06b0173fa73c1ed15c0 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:04 +0300 Subject: nl80211: simplify peer specific TID configuration Current rule for applying TID configuration for specific peer looks overly complicated. No need to reject new TID configuration when override flag is specified. Another call with the same TID configuration, but without override flag, allows to apply new configuration anyway. Use the same approach as for the 'all peers' case: if override flag is specified, then reset existing TID configuration and immediately apply a new one. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-5-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 10 ++++------ net/wireless/nl80211.c | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1ccb0bf657ec..d1b1d9e49887 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4823,12 +4823,10 @@ enum nl80211_tid_config { * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but * per peer instead. - * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if no peer - * is selected, if set indicates that the new configuration overrides - * all previous peer configurations, otherwise previous peer specific - * configurations should be left untouched. If peer is selected then - * it will reset particular TID configuration of that peer and it will - * not accept other TID config attributes along with peer. + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if set indicates + * that the new configuration overrides all previous peer + * configurations, otherwise previous peer specific configurations + * should be left untouched. * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs (bit 0 to 7) * Its type is u16. * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. 
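[Editor's note: with the simplified semantics, a config entry that
should wipe earlier state simply carries the flag; the kernel then
resets the existing TID configuration before applying the new one. An
illustrative fragment, not part of the commit, with message setup as in
the earlier libnl sketch.]

	/* Reset any previous TID configuration, then apply this one */
	entry = nla_nest_start(msg, 1);
	nla_put_flag(msg, NL80211_TID_CONFIG_ATTR_OVERRIDE);
	nla_put_u16(msg, NL80211_TID_CONFIG_ATTR_TIDS, 1 << 0);
	nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL,
		   NL80211_TID_CONFIG_ENABLE);
	nla_nest_end(msg, entry);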
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 482a80b78844..258c621f651c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14036,10 +14036,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev,

 	if (rdev->ops->reset_tid_config) {
 		err = rdev_reset_tid_config(rdev, dev, peer,
 					    tid_conf->tids);
-		/* If peer is there no other configuration will be
-		 * allowed
-		 */
-		if (err || peer)
+		if (err)
 			return err;
 	} else {
 		return -EINVAL;
--
cgit v1.2.3

From e76fede8bf7c90d92c799d9ceb092dec48346e2c Mon Sep 17 00:00:00 2001
From: Thomas Pedersen
Date: Thu, 30 Apr 2020 10:25:50 -0700
Subject: cfg80211: add KHz variants of frame RX API

Drivers may wish to report the RX frequency in units of KHz. Provide
cfg80211_rx_mgmt_khz() and wrap it with cfg80211_rx_mgmt() so existing
drivers which can't report KHz anyway don't need to change. Add a
similar wrapper for cfg80211_report_obss_beacon() so the frequency
units stay somewhat consistent.

This doesn't actually change the nl80211 API yet.

Signed-off-by: Thomas Pedersen
Link: https://lore.kernel.org/r/20200430172554.18383-2-thomas@adapt-ip.com
[fix mac80211 calling the non-khz version of obss beacon report,
 drop trace point name changes]
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h | 2 ++
 include/net/cfg80211.h | 54 ++++++++++++++++++++++++++++++++++++++++++-----
 net/mac80211/rx.c | 12 ++++++-----
 net/wireless/mlme.c | 6 +++---
 net/wireless/nl80211.c | 12 +++++------
 net/wireless/trace.h | 8 +++----
 6 files changed, 71 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a561db435a4b..41d5f000c0d9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3333,6 +3333,8 @@ static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
 /* convert frequencies */
 #define MHZ_TO_KHZ(freq) ((freq) * 1000)
 #define KHZ_TO_MHZ(freq) ((freq) / 1000)
+#define PR_KHZ(f) KHZ_TO_MHZ(f), f % 1000
+#define KHZ_F "%d.%03d"

 /* convert powers */
 #define DBI_TO_MBI(gain) ((gain) * 100)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5cacf24cc9f0..7415f77d99ca 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6988,6 +6988,26 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 			  enum nl80211_connect_failed_reason reason,
 			  gfp_t gfp);

+/**
+ * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame
+ * @wdev: wireless device receiving the frame
+ * @freq: Frequency on which the frame was received in KHz
+ * @sig_dbm: signal strength in dBm, or 0 if unknown
+ * @buf: Management frame (header + body)
+ * @len: length of the frame data
+ * @flags: flags, as defined in enum nl80211_rxmgmt_flags
+ *
+ * This function is called whenever an Action frame is received for a station
+ * mode interface, but is not processed in kernel.
+ *
+ * Return: %true if a user space application has registered for this frame.
+ * For action frames, that makes it responsible for rejecting unrecognized
+ * action frames; %false otherwise, in which case for action frames the
+ * driver is responsible for rejecting the frame.
+ */ +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags); + /** * cfg80211_rx_mgmt - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame @@ -7005,8 +7025,13 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * action frames; %false otherwise, in which case for action frames the * driver is responsible for rejecting the frame. */ -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags); +static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, + int sig_dbm, const u8 *buf, size_t len, + u32 flags) +{ + return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len, + flags); +} /** * cfg80211_mgmt_tx_status - notification of TX status for management frame @@ -7204,6 +7229,21 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, s32 ack_signal, bool is_valid_ack_signal, gfp_t gfp); +/** + * cfg80211_report_obss_beacon_khz - report beacon from other APs + * @wiphy: The wiphy that received the beacon + * @frame: the frame + * @len: length of the frame + * @freq: frequency the frame was received on in KHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * + * Use this function to report to userspace when a beacon was + * received. It is not useful to call this when there is no + * netdev that is in AP/GO mode. + */ +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm); + /** * cfg80211_report_obss_beacon - report beacon from other APs * @wiphy: The wiphy that received the beacon @@ -7216,9 +7256,13 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, * received. It is not useful to call this when there is no * netdev that is in AP/GO mode. 
*/ -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm); +static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, + const u8 *frame, size_t len, + int freq, int sig_dbm) +{ + cfg80211_report_obss_beacon_khz(wiphy, frame, len, MHZ_TO_KHZ(freq), + sig_dbm); +} /** * cfg80211_reg_can_beacon - check if beaconing is allowed diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index eaf8931e4627..8e47b0d31051 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3095,9 +3095,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - cfg80211_report_obss_beacon(rx->local->hw.wiphy, - rx->skb->data, rx->skb->len, - status->freq, sig); + cfg80211_report_obss_beacon_khz(rx->local->hw.wiphy, + rx->skb->data, rx->skb->len, + ieee80211_rx_status_to_khz(status), + sig); rx->flags |= IEEE80211_RX_BEACON_REPORTED; } @@ -3443,8 +3444,9 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0)) { + if (cfg80211_rx_mgmt_khz(&rx->sdata->wdev, + ieee80211_rx_status_to_khz(status), sig, + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 409497a3527d..189334314cba 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -729,8 +729,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags) +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -785,7 +785,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, trace_cfg80211_return_bool(result); return result; } -EXPORT_SYMBOL(cfg80211_rx_mgmt); +EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 258c621f651c..f6523f1485a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16214,7 +16214,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16840,9 +16840,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, } EXPORT_SYMBOL(cfg80211_probe_status); -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm) +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; @@ -16865,7 +16864,8 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq)) || + nla_put_u32(msg, 
NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) @@ -16882,7 +16882,7 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, spin_unlock_bh(&rdev->beacon_registrations_lock); nlmsg_free(msg); } -EXPORT_SYMBOL(cfg80211_report_obss_beacon); +EXPORT_SYMBOL(cfg80211_report_obss_beacon_khz); #ifdef CONFIG_PM static int cfg80211_net_detect_results(struct sk_buff *msg, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 53c887ea67c7..f2ab44a2a3e4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2840,8 +2840,8 @@ TRACE_EVENT(cfg80211_rx_mgmt, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d", - WDEV_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d", + WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_mgmt_tx_status, @@ -3121,8 +3121,8 @@ TRACE_EVENT(cfg80211_report_obss_beacon, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WIPHY_PR_FMT ", freq: %d, sig_dbm: %d", - WIPHY_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d", + WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_tdls_oper_request, -- cgit v1.2.3 From 942ba88ba9c87f5e225574f1f0d6548f0105ed73 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:51 -0700 Subject: nl80211: add KHz frequency offset for most wifi commands cfg80211 recently gained the ability to understand a frequency offset component in KHz. Expose this in nl80211 through the new attributes NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_FREQUENCY_ATTR_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, and NL80211_BSS_FREQUENCY_OFFSET. These add support to send and receive a KHz offset component with the following NL80211 commands: - NL80211_CMD_FRAME - NL80211_CMD_GET_SCAN - NL80211_CMD_AUTHENTICATE - NL80211_CMD_ASSOCIATE - NL80211_CMD_CONNECT Along with any other command which takes a chandef, ie: - NL80211_CMD_SET_CHANNEL - NL80211_CMD_SET_WIPHY - NL80211_CMD_START_AP - NL80211_CMD_RADAR_DETECT - NL80211_CMD_NOTIFY_RADAR - NL80211_CMD_CHANNEL_SWITCH - NL80211_JOIN_IBSS - NL80211_CMD_REMAIN_ON_CHANNEL - NL80211_CMD_JOIN_OCB - NL80211_CMD_JOIN_MESH - NL80211_CMD_TDLS_CHANNEL_SWITCH If the driver advertises a band containing channels with frequency offset, it must also verify support for frequency offset channels in its cfg80211 ops, or return an error. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 50 ++++++++++++++++++---------- net/wireless/nl80211.c | 78 ++++++++++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1b1d9e49887..b1cd132c1d27 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -296,13 +296,14 @@ * to get a list of all present wiphys. 
* @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, - * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ (and the - * attributes determining the channel width; this is used for setting - * monitor mode channel), %NL80211_ATTR_WIPHY_RETRY_SHORT, - * %NL80211_ATTR_WIPHY_RETRY_LONG, %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, - * and/or %NL80211_ATTR_WIPHY_RTS_THRESHOLD. - * However, for setting the channel, see %NL80211_CMD_SET_CHANNEL - * instead, the support here is for backward compatibility only. + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET (and the attributes determining the + * channel width; this is used for setting monitor mode channel), + * %NL80211_ATTR_WIPHY_RETRY_SHORT, %NL80211_ATTR_WIPHY_RETRY_LONG, + * %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, and/or + * %NL80211_ATTR_WIPHY_RTS_THRESHOLD. However, for setting the channel, + * see %NL80211_CMD_SET_CHANNEL instead, the support here is for backward + * compatibility only. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -351,7 +352,8 @@ * %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_INACTIVITY_TIMEOUT, * %NL80211_ATTR_ACL_POLICY and %NL80211_ATTR_MAC_ADDRS. * The channel to use can be set on the interface or be given using the - * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel width. + * %NL80211_ATTR_WIPHY_FREQ and %NL80211_ATTR_WIPHY_FREQ_OFFSET, and the + * attributes determining channel width. * @NL80211_CMD_NEW_BEACON: old alias for %NL80211_CMD_START_AP * @NL80211_CMD_STOP_AP: Stop AP operation on the given interface * @NL80211_CMD_DEL_BEACON: old alias for %NL80211_CMD_STOP_AP @@ -536,11 +538,12 @@ * interface. %NL80211_ATTR_MAC is used to specify PeerSTAAddress (and * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify * the SSID (mainly for association, but is included in authentication - * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ is used - * to specify the frequence of the channel in MHz. %NL80211_ATTR_AUTH_TYPE - * is used to specify the authentication type. %NL80211_ATTR_IE is used to - * define IEs (VendorSpecificInfo, but also including RSN IE and FT IEs) - * to be added to the frame. + * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ + + * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequence of the + * channel in MHz. %NL80211_ATTR_AUTH_TYPE is used to specify the + * authentication type. %NL80211_ATTR_IE is used to define IEs + * (VendorSpecificInfo, but also including RSN IE and FT IEs) to be added + * to the frame. * When used as an event, this reports reception of an Authentication * frame in station and IBSS modes when the local MLME processed the * frame, i.e., it was for the local STA and was received in correct @@ -595,8 +598,9 @@ * requests to connect to a specified network but without separating * auth and assoc steps. 
For this, you need to specify the SSID in a * %NL80211_ATTR_SSID attribute, and can optionally specify the association - * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_USE_MFP, - * %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, %NL80211_ATTR_CONTROL_PORT, + * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, + * %NL80211_ATTR_USE_MFP, %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET, %NL80211_ATTR_CONTROL_PORT, * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and @@ -1433,7 +1437,8 @@ enum nl80211_commands { * of &enum nl80211_chan_width, describing the channel width. See the * documentation of the enum for more information. * @NL80211_ATTR_CENTER_FREQ1: Center frequency of the first part of the - * channel, used for anything but 20 MHz bandwidth + * channel, used for anything but 20 MHz bandwidth. In S1G this is the + * operating channel center frequency. * @NL80211_ATTR_CENTER_FREQ2: Center frequency of the second part of the * channel, used only for 80+80 MHz bandwidth * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ @@ -2480,9 +2485,14 @@ enum nl80211_commands { * entry without having to force a disconnection after the PMK timeout. If * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. - * * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * @NL80211_ATTR_WIPHY_FREQ_OFFSET: offset of the associated + * %NL80211_ATTR_WIPHY_FREQ in positive KHz. Only valid when supplied with + * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. + * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the + * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. + * * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2960,6 +2970,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_REAUTH_THRESHOLD, NL80211_ATTR_RECEIVE_MULTICAST, + NL80211_ATTR_WIPHY_FREQ_OFFSET, + NL80211_ATTR_CENTER_FREQ1_OFFSET, /* add attributes here, update the policy in nl80211.c */ @@ -3682,6 +3694,7 @@ enum nl80211_wmm_rule { * (see &enum nl80211_wmm_rule) * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3712,6 +3725,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_10MHZ, NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, + NL80211_FREQUENCY_ATTR_OFFSET, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4482,6 +4496,7 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update. * Contains a nested array of signal strength attributes (u8, dBm), * using the nesting index as the antenna number. 
+ * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4506,6 +4521,7 @@ enum nl80211_bss { NL80211_BSS_PARENT_TSF, NL80211_BSS_PARENT_BSSID, NL80211_BSS_CHAIN_SIGNAL, + NL80211_BSS_FREQUENCY_OFFSET, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f6523f1485a3..87d7efd186d0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -365,6 +365,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, + [NL80211_ATTR_CENTER_FREQ1_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), @@ -638,6 +639,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, + [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), }; /* policy for the key attributes */ @@ -904,6 +906,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, chan->center_freq)) goto nla_put_failure; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DISABLED) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED)) goto nla_put_failure; @@ -1309,13 +1314,11 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) } static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, - struct nlattr *tb) + u32 freq) { struct ieee80211_channel *chan; - if (tb == NULL) - return NULL; - chan = ieee80211_get_channel(wiphy, nla_get_u32(tb)); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) return NULL; return chan; @@ -2770,13 +2773,17 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, if (!attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + control_freq = MHZ_TO_KHZ( + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + control_freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); memset(chandef, 0, sizeof(*chandef)); - - chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); + chandef->chan = ieee80211_get_channel_khz(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = control_freq; + chandef->center_freq1 = KHZ_TO_MHZ(control_freq); + chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; /* Primary channel not allowed */ @@ -2824,9 +2831,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (attrs[NL80211_ATTR_CENTER_FREQ1]) + if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); + if (attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) + chandef->freq1_offset = nla_get_u32( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]); + else + chandef->freq1_offset = 0; + } if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); @@ -3259,6 
+3272,9 @@ static int nl80211_send_chandef(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chandef->chan->center_freq)) return -ENOBUFS; + if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + chandef->chan->freq_offset)) + return -ENOBUFS; switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: @@ -8873,6 +8889,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET, + res->channel->freq_offset) || nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) @@ -9141,6 +9159,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; + u32 freq; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -9197,8 +9216,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -9388,6 +9411,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_assoc_request req = {}; const u8 *bssid, *ssid; int err, ssid_len = 0; + u32 freq; if (dev->ieee80211_ptr->conn_owner_nlportid && dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) @@ -9407,8 +9431,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -10088,6 +10115,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) struct cfg80211_connect_params connect; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; + u32 freq = 0; int err; memset(&connect, 0, sizeof(connect)); @@ -10158,14 +10186,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - connect.channel = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) + freq = MHZ_TO_KHZ(nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + if (freq) { + connect.channel = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel) return -EINVAL; } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) { - connect.channel_hint = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = 
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = MHZ_TO_KHZ(freq); + connect.channel_hint = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel_hint) return -EINVAL; } @@ -16215,6 +16250,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16864,8 +16900,10 @@ void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, - KHZ_TO_MHZ(freq))) || + (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + freq % 1000))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) -- cgit v1.2.3 From 2032f3b2f943256ff40df23182913dfc7e73ec6a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:52 -0700 Subject: nl80211: support scan frequencies in KHz If the driver advertises NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, userspace can omit NL80211_ATTR_SCAN_FREQUENCIES in favor of an NL80211_ATTR_SCAN_FREQ_KHZ attribute. To get scan results in KHz, userspace must also set the NL80211_SCAN_FLAG_FREQ_KHZ flag. This lets nl80211 remain compatible with older userspaces while not requiring and sending redundant (and potentially incorrect) scan frequency sets. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-4-thomas@adapt-ip.com [use just nla_nest_start() (not _noflag) for NL80211_ATTR_SCAN_FREQ_KHZ] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 ++++++++++- net/mac80211/main.c | 2 ++ net/wireless/nl80211.c | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 53 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b1cd132c1d27..47d39b6a073d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2492,7 +2492,7 @@ enum nl80211_commands { * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. - * + * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2972,6 +2972,7 @@ enum nl80211_attrs { NL80211_ATTR_RECEIVE_MULTICAST, NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, + NL80211_ATTR_SCAN_FREQ_KHZ, /* add attributes here, update the policy in nl80211.c */ @@ -5723,6 +5724,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations * are possible for multicast frames and those will be reported properly. * + * @NL80211_EXT_FEATURE_SCAN_FREQ_KHZ: This driver supports receiving and + * reporting scan requests with %NL80211_ATTR_SCAN_FREQ_KHZ. In order to + * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be + * included in the scan request. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
*/ @@ -5776,6 +5782,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5887,6 +5894,9 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to * only have supported rates and no additional capabilities (unless * added by userspace explicitly.) + * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with + * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means + * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -5902,6 +5912,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, + NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, }; /** diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 06c90d360633..ac74bd780b42 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 87d7efd186d0..84bfa147769a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -640,6 +640,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), + [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -7719,6 +7720,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_scan_request *request; + struct nlattr *scan_freqs = NULL; + bool scan_freqs_khz = false; struct nlattr *attr; struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; @@ -7737,9 +7740,17 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto unlock; } - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { - n_channels = validate_scan_freqs( - info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ)) + return -EOPNOTSUPP; + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]; + scan_freqs_khz = true; + } else if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]; + + if (scan_freqs) { + n_channels = validate_scan_freqs(scan_freqs); if (!n_channels) { err = -EINVAL; goto unlock; @@ -7787,13 +7798,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } i = 0; - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + if (scan_freqs) { /* user specified, bail out if channel not found */ - nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + nla_for_each_nested(attr, scan_freqs, tmp) { struct ieee80211_channel *chan; + int freq = nla_get_u32(attr); - chan = 
ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!scan_freqs_khz) + freq = MHZ_TO_KHZ(freq); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan) { err = -EINVAL; goto out_free; } @@ -15231,14 +15245,27 @@ static int nl80211_add_scan_req(struct sk_buff *msg, } nla_nest_end(msg, nest); - nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); - if (!nest) - goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, + ieee80211_channel_to_khz(req->channels[i]))) + goto nla_put_failure; + } + nla_nest_end(msg, nest); + } else { + nest = nla_nest_start_noflag(msg, + NL80211_ATTR_SCAN_FREQUENCIES); + if (!nest) goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + goto nla_put_failure; + } + nla_nest_end(msg, nest); } - nla_nest_end(msg, nest); if (req->ie && nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) -- cgit v1.2.3 From 396fba0a59f3c94d6fd6443fbeabd8bd9e3956eb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:39:09 -0500 Subject: cfg80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R.
Silva Link: https://lore.kernel.org/r/20200507183909.GA12993@embeddedor Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 +++--- include/net/cfg80211.h | 8 ++++---- net/wireless/core.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2153d465d752..0320ca4c7d28 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -800,7 +800,7 @@ struct ieee80211_msrment_ie { u8 token; u8 mode; u8 type; - u8 request[0]; + u8 request[]; } __packed; /** @@ -1781,7 +1781,7 @@ struct ieee80211_he_operation { __le32 he_oper_params; __le16 he_mcs_nss_set; /* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */ - u8 optional[0]; + u8 optional[]; } __packed; /** @@ -1793,7 +1793,7 @@ struct ieee80211_he_spr { u8 he_sr_control; /* Optional 0 to 19 bytes: depends on @he_sr_control */ - u8 optional[0]; + u8 optional[]; } __packed; /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7415f77d99ca..021366cfb2b0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2037,7 +2037,7 @@ struct cfg80211_scan_request { bool no_cck; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2183,7 +2183,7 @@ struct cfg80211_sched_scan_request { struct list_head list; /* keep last */ - struct ieee80211_channel *channels[0]; + struct ieee80211_channel *channels[]; }; /** @@ -2305,7 +2305,7 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; - u8 priv[0] __aligned(sizeof(void *)); + u8 priv[] __aligned(sizeof(void *)); }; /** @@ -4852,7 +4852,7 @@ struct wiphy { u8 max_data_retry_count; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; static inline struct net *wiphy_net(struct wiphy *wiphy) diff --git a/net/wireless/core.h b/net/wireless/core.h index 639d41896573..e0e5b3ee9699 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -286,7 +286,7 @@ struct cfg80211_cqm_config { u32 rssi_hyst; s32 last_rssi_event_value; int n_rssi_thresholds; - s32 rssi_thresholds[0]; + s32 rssi_thresholds[]; }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); -- cgit v1.2.3 From 3c23215ba8c70c0e9b16beffb7f700a401391e38 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:59:07 -0500 Subject: mac80211: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kinds of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1].
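To make the sizeof quirk concrete, here is a minimal sketch (illustrative only; the struct names are made up and do not come from the patch):

	/* GNU zero-length array: sizeof(tail) quietly evaluates to 0 */
	struct old_style {
		int stuff;
		int tail[0];
	};

	/* C99 flexible array member: tail has incomplete type */
	struct new_style {
		int stuff;
		int tail[];
	};

	/*
	 * sizeof(struct old_style) == sizeof(struct new_style) == sizeof(int),
	 * but sizeof(((struct new_style *)0)->tail) no longer compiles, which
	 * is exactly the accidental misuse the conversion flushes out, and a
	 * flexible array member anywhere but last in the struct is an error.
	 * Allocation is unchanged, e.g.:
	 *	struct new_style *p = kmalloc(struct_size(p, tail, n), GFP_KERNEL);
	 */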
There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507185907.GA15102@embeddedor Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 +++++----- net/mac80211/ieee80211_i.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0d48e679efb0..7cb712427df1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -230,7 +230,7 @@ struct ieee80211_chanctx_conf { bool radar_enabled; - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** @@ -1670,7 +1670,7 @@ struct ieee80211_vif { bool txqs_stopped[IEEE80211_NUM_ACS]; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) @@ -1798,7 +1798,7 @@ struct ieee80211_key_conf { s8 keyidx; u16 flags; u8 keylen; - u8 key[0]; + u8 key[]; }; #define IEEE80211_MAX_PN_LEN 16 @@ -2053,7 +2053,7 @@ struct ieee80211_sta { struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** @@ -2099,7 +2099,7 @@ struct ieee80211_txq { u8 ac; /* must be last */ - u8 drv_priv[0] __aligned(sizeof(void *)); + u8 drv_priv[] __aligned(sizeof(void *)); }; /** diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8cbae66b5cdb..2d1b6cb75497 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -267,7 +267,7 @@ struct probe_resp { struct rcu_head rcu_head; int len; u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; - u8 data[0]; + u8 data[]; }; struct ps_data { -- cgit v1.2.3 From dca9ca2d588bd2c0989c671f048540b82e57cf1e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Fri, 8 May 2020 16:42:00 +0200 Subject: nl80211: add ability to report TX status for control port TX This adds the necessary capabilities in nl80211 to allow drivers to assign a cookie to control port TX frames (returned via extack in the netlink ACK message of the command) and then later report the frame's status. 
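As a rough sketch of the intended driver-side flow (only cfg80211_control_port_tx_status() and the cookie argument of the tx_control_port() op come from this patch; the drv_* names and the cookie scheme are hypothetical):

	/* Hand out a cookie when the frame is queued for TX ... */
	static int drv_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
				       const u8 *buf, size_t len, const u8 *dest,
				       const __be16 proto, const bool noencrypt,
				       u64 *cookie)
	{
		static u64 next_cookie;		/* hypothetical driver state */

		if (cookie)
			*cookie = ++next_cookie;	/* remember it with the frame */
		/* ... hand the frame to the hardware queue here ... */
		return 0;
	}

	/* ... and report the result once TX completion arrives */
	static void drv_control_port_tx_done(struct wireless_dev *wdev, u64 cookie,
					     const u8 *buf, size_t len, bool acked)
	{
		cfg80211_control_port_tx_status(wdev, cookie, buf, len,
						acked, GFP_KERNEL);
	}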
Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200508144202.7678-2-markus.theil@tu-ilmenau.de [use extack cookie instead of explicit message, recombine patches] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 +++++++++++++++++++- include/uapi/linux/nl80211.h | 12 ++++++++++++ net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/tx.c | 3 ++- net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++-------- net/wireless/rdev-ops.h | 9 ++++++--- net/wireless/trace.h | 17 +++++++++++++++++ 7 files changed, 91 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 021366cfb2b0..f842f3652026 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4069,7 +4069,8 @@ struct cfg80211_ops { struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, const __be16 proto, - const bool noencrypt); + const bool noencrypt, + u64 *cookie); int (*get_ftm_responder_stats)(struct wiphy *wiphy, struct net_device *dev, @@ -7049,6 +7050,23 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp); +/** + * cfg80211_control_port_tx_status - notification of TX status for control + * port frames + * @wdev: wireless device receiving the frame + * @cookie: Cookie returned by cfg80211_ops::tx_control_port() + * @buf: Data frame (header + body) + * @len: length of the frame data + * @ack: Whether frame was acknowledged + * @gfp: context flags + * + * This function is called whenever a control port frame was requested to be + * transmitted with cfg80211_ops::tx_control_port() to report the TX status of + * the transmission attempt. + */ +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); /** * cfg80211_rx_control_port - notification about a received control port frame diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 47d39b6a073d..0f324b6b81cc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1164,6 +1164,12 @@ * dropped because it did not include a valid MME MIC while beacon * protection was enabled (BIGTK configured in station mode). * + * @NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS: Report TX status of a control + * port frame transmitted with %NL80211_CMD_CONTROL_PORT_FRAME. + * %NL80211_ATTR_COOKIE identifies the TX command and %NL80211_ATTR_FRAME + * includes the contents of the frame. %NL80211_ATTR_ACK flag is included + * if the recipient acknowledged the frame. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1392,6 +1398,8 @@ enum nl80211_commands { NL80211_CMD_UNPROT_BEACON, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -5729,6 +5737,9 @@ enum nl80211_feature_flags { * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be * included in the scan request. * + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver + * can report tx status for control port over nl80211 tx operations. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5783,6 +5794,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2d1b6cb75497..b87dc873825b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1800,7 +1800,8 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted); + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 47f460c8bd74..5931128e1855 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5339,7 +5339,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted) + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84bfa147769a..7ea764865546 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13866,6 +13866,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) { + bool dont_wait_for_ack = info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -13874,6 +13875,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) u8 *dest; u16 proto; bool noencrypt; + u64 cookie = 0; int err; if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -13918,9 +13920,12 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) noencrypt = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); - return rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt); - + err = rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt, + dont_wait_for_ack ? 
NULL : &cookie); + if (!err && !dont_wait_for_ack) + nl_set_extack_cookie_u64(info->extack, cookie); + return err; out: wdev_unlock(wdev); return err; @@ -16294,8 +16299,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp, enum nl80211_commands command) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -16303,13 +16309,16 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, struct sk_buff *msg; void *hdr; - trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + if (command == NL80211_CMD_FRAME_TX_STATUS) + trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + else + trace_cfg80211_control_port_tx_status(wdev, cookie, ack); msg = nlmsg_new(100 + len, gfp); if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, command); if (!hdr) { nlmsg_free(msg); return; @@ -16332,9 +16341,25 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, NL80211_MCGRP_MLME, gfp); return; - nla_put_failure: +nla_put_failure: nlmsg_free(msg); } + +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS); +} +EXPORT_SYMBOL(cfg80211_control_port_tx_status); + +void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_FRAME_TX_STATUS); +} EXPORT_SYMBOL(cfg80211_mgmt_tx_status); static int __nl80211_rx_control_port(struct net_device *dev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index df5142e86c4f..950d57494168 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -748,14 +748,17 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, - const bool noencrypt) + const bool noencrypt, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); - trace_rdev_return_int(&rdev->wiphy, ret); + dest, proto, noencrypt, cookie); + if (cookie) + trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); + else + trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f2ab44a2a3e4..b23cab016521 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2861,6 +2861,23 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_control_port_tx_status, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack), + TP_ARGS(wdev, cookie, ack), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + __field(bool, ack) + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + __entry->ack = ack; + ), + TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s", + WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) +); + TRACE_EVENT(cfg80211_rx_control_port, TP_PROTO(struct net_device *netdev, 
struct sk_buff *skb, bool unencrypted), -- cgit v1.2.3 From 1ea02224afc29431880a67b8c3198146cc01d33e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 May 2020 10:31:33 +0200 Subject: mac80211: allow SA-QUERY processing in userspace As discussed with Mathy almost two years ago in http://lore.kernel.org/r/20180806224857.14853-1-Mathy.Vanhoef@cs.kuleuven.be we should let userspace process SA-QUERY frames if it wants to, so that it can handle OCV (operating channel validation), which mac80211 doesn't know how to do. Evidently I had been expecting Mathy to (re)send such a patch, but he never did, perhaps expecting me to do it after our discussion. In any case, this came up now with OCV getting more attention, so move the code around as discussed there to let userspace handle it, and do it properly. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200526103131.1f9cf7e5b6db.Iae5b42b09ad2b1cbcbe13492002c43f0d1d51dfc@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 49 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8e47b0d31051..8b0fa5e345f4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3354,19 +3354,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } } break; - case WLAN_CATEGORY_SA_QUERY: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.sa_query))) - break; - - switch (mgmt->u.action.u.sa_query.action) { - case WLAN_ACTION_SA_QUERY_REQUEST: - if (sdata->vif.type != NL80211_IFTYPE_STATION) - break; - ieee80211_process_sa_query_req(sdata, mgmt, len); - goto handled; - } - break; case WLAN_CATEGORY_SELF_PROTECTED: if (len < (IEEE80211_MIN_ACTION_SIZE + sizeof(mgmt->u.action.u.self_prot.action_code))) @@ -3456,6 +3443,41 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + int len = rx->skb->len; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + break; + + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + ieee80211_process_sa_query_req(sdata, mgmt, len); + goto handled; + } + break; + } + + return RX_CONTINUE; + + handled: + if (rx->sta) + rx->sta->rx_stats.packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) { @@ -3736,6 +3758,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, CALL_RXH(ieee80211_rx_h_mgmt_check); CALL_RXH(ieee80211_rx_h_action); CALL_RXH(ieee80211_rx_h_userspace_mgmt); + CALL_RXH(ieee80211_rx_h_action_post_userspace); CALL_RXH(ieee80211_rx_h_action_return); CALL_RXH(ieee80211_rx_h_mgmt); -- cgit v1.2.3 From 9a5f6488623730dc16cca0836ade23869761adee Mon Sep 17 00:00:00 2001 From: Tamizh Chelvam Date: Wed, 13 May 2020 13:41:44 +0530 Subject: nl80211: Add support to configure TID specific Tx rate configuration This patch adds support for per-TID Tx rate configuration through the NL80211_TID_CONFIG_ATTR_TX_RATE* attributes.
It uses the nl80211_parse_tx_bitrate_mask() API to validate the Tx rate mask. Signed-off-by: Tamizh Chelvam Link: https://lore.kernel.org/r/1589357504-10175-1-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++---------- include/uapi/linux/nl80211.h | 21 +++++++++++++++++ net/wireless/nl80211.c | 56 +++++++++++++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f842f3652026..e2dbc9c02ef3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -630,6 +630,19 @@ struct cfg80211_chan_def { u16 freq1_offset; }; +/* + * cfg80211_bitrate_mask - masks for bitrate control + */ +struct cfg80211_bitrate_mask { + struct { + u32 legacy; + u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mcs[NL80211_VHT_NSS_MAX]; + enum nl80211_txrate_gi gi; + } control[NUM_NL80211_BANDS]; +}; + + /** * struct cfg80211_tid_cfg - TID specific configuration * @config_override: Flag to notify driver to reset TID configuration @@ -643,6 +656,8 @@ struct cfg80211_chan_def { * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS * @amsdu: Enable/Disable MSDU aggregation + * @txrate_type: Tx bitrate mask type + * @txrate_mask: Tx bitrate to be applied for the TID */ struct cfg80211_tid_cfg { bool config_override; @@ -653,6 +668,8 @@ struct cfg80211_tid_cfg { enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; enum nl80211_tid_config amsdu; + enum nl80211_tx_rate_setting txrate_type; + struct cfg80211_bitrate_mask txrate_mask; }; /** @@ -1007,18 +1024,6 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; -/* - * cfg80211_bitrate_mask - masks for bitrate control - */ -struct cfg80211_bitrate_mask { - struct { - u32 legacy; - u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; - u16 vht_mcs[NL80211_VHT_NSS_MAX]; - enum nl80211_txrate_gi gi; - } control[NUM_NL80211_BANDS]; -}; - /** * enum cfg80211_ap_settings_flags - AP settings flags * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0f324b6b81cc..c14666b75e57 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4841,6 +4841,17 @@ enum nl80211_tid_config { NL80211_TID_CONFIG_DISABLE, }; +/* enum nl80211_tx_rate_setting - TX rate configuration type + * @NL80211_TX_RATE_AUTOMATIC: automatically determine TX rate + * @NL80211_TX_RATE_LIMITED: limit the TX rate by the TX rate parameter + * @NL80211_TX_RATE_FIXED: fix TX rate to the TX rate parameter + */ +enum nl80211_tx_rate_setting { + NL80211_TX_RATE_AUTOMATIC, + NL80211_TX_RATE_LIMITED, + NL80211_TX_RATE_FIXED, +}; + /* enum nl80211_tid_config_attr - TID specific configuration. * @NL80211_TID_CONFIG_ATTR_PAD: pad attribute for 64-bit values * @NL80211_TID_CONFIG_ATTR_VIF_SUPP: a bitmap (u64) of attributes supported @@ -4876,6 +4887,14 @@ enum nl80211_tid_config { * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. * Its type is u8, using the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE: This attribute notifies the + * driver what type of TX rate should be used for the TIDs specified + * in %NL80211_TID_CONFIG_ATTR_TIDS, using + * the values from &nl80211_tx_rate_setting. + * @NL80211_TID_CONFIG_ATTR_TX_RATE: Data frame TX rate mask should be applied + * with the parameters passed through %NL80211_ATTR_TX_RATES.
+ * The configuration is applied to the data frames of the given TID for + * that connected station. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4890,6 +4909,8 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, + NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, + NL80211_TID_CONFIG_ATTR_TX_RATE, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7ea764865546..22c4d13e28cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -329,6 +329,15 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, }; +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + static const struct nla_policy nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 }, @@ -345,6 +354,10 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE] = + NLA_POLICY_MAX(NLA_U8, NL80211_TX_RATE_FIXED), + [NL80211_TID_CONFIG_ATTR_TX_RATE] = + NLA_POLICY_NESTED(nl80211_txattr_policy), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -4388,16 +4401,9 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct nlattr *attrs[], + enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; @@ -4428,14 +4434,14 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, } /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) + if (!attrs[attr]) goto out; /* The nested attribute uses enum nl80211_band as the index. This maps * directly to the enum nl80211_band values used in cfg80211.
*/ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + nla_for_each_nested(tx_rates, attrs[attr], rem) { enum nl80211_band band = nla_type(tx_rates); int err; @@ -4940,7 +4946,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &params.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &params.beacon_rate); if (err) return err; @@ -10753,7 +10761,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - err = nl80211_parse_tx_bitrate_mask(info, &mask); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, &mask); if (err) return err; @@ -11359,7 +11368,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &setup.beacon_rate); if (err) return err; @@ -14139,6 +14150,23 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE]) { + u32 idx = NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, attr; + + tid_conf->txrate_type = nla_get_u8(attrs[idx]); + + if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { + attr = NL80211_TID_CONFIG_ATTR_TX_RATE; + err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, + &tid_conf->txrate_mask); + if (err) + return err; + + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE); + } + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From a3b018febccd3686c39e86e98b5081bde014fc66 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 17 May 2020 18:30:19 +0200 Subject: cfg80211: fix CFG80211_CRDA_SUPPORT still mentioning internal regdb Back with commit c8c240e284b3 (cfg80211: reg: remove support for built-in regdb, 2015-10-15), support for using CFG80211_INTERNAL_REGDB was removed in favor of loading the regulatory database as a firmware file. The documentation of CFG80211_CRDA_SUPPORT was not adjusted, though, which is why it still mentions the old way of loading via the internal regulatory database. Remove it so that the kernel option only mentions using the firmware file. Signed-off-by: Patrick Steinhardt Link: https://lore.kernel.org/r/c56e60207fbd0512029de8c6276ee00f73491924.1589732954.git.ps@pks.im Signed-off-by: Johannes Berg --- net/wireless/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 63cf7131f601..813e93644ae7 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -181,8 +181,8 @@ config CFG80211_CRDA_SUPPORT default y help You should enable this option unless you know for sure you have no - need for it, for example when using internal regdb (above) or the - database loaded as a firmware file. + need for it, for example when using the regulatory database loaded as + a firmware file. If unsure, say Y.
-- cgit v1.2.3 From c11299243370580832c27882dcedf2604f9f48f8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 26 May 2020 14:33:48 +0200 Subject: mac80211: fix HT-Control field reception for management frames If we receive management frames with an HT-Control field, we cannot parse them properly, as we assume a fixed length management header. Since we don't even need the HTC field (for these frames, or really at all), just remove it at the beginning of RX. Reported-by: Haggai Abramovsky Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200526143346.cf5ce70521c5.I333251a084ec4cfe67b7ef7efe2d2f1a33883931@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8b0fa5e345f4..21854a61a2b7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -93,13 +93,44 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, * This function cleans up the SKB, i.e. it removes all the stuff * only useful for monitoring. */ -static void remove_monitor_info(struct sk_buff *skb, - unsigned int present_fcs_len, - unsigned int rtap_space) +static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, + unsigned int present_fcs_len, + unsigned int rtap_space) { + struct ieee80211_hdr *hdr; + unsigned int hdrlen; + __le16 fc; + if (present_fcs_len) __pskb_trim(skb, skb->len - present_fcs_len); __pskb_pull(skb, rtap_space); + + hdr = (void *)skb->data; + fc = hdr->frame_control; + + /* + * Remove the HT-Control field (if present) on management + * frames after we've sent the frame to monitoring. We + * (currently) don't need it, and don't properly parse + * frames with it present, due to the assumption of a + * fixed management header length. + */ + if (likely(!ieee80211_is_mgmt(fc) || !ieee80211_has_order(fc))) + return skb; + + hdrlen = ieee80211_hdrlen(fc); + hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_ORDER); + + if (!pskb_may_pull(skb, hdrlen)) { + dev_kfree_skb(skb); + return NULL; + } + + memmove(skb->data + IEEE80211_HT_CTL_LEN, skb->data, + hdrlen - IEEE80211_HT_CTL_LEN); + __pskb_pull(skb, IEEE80211_HT_CTL_LEN); + + return skb; } static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, @@ -827,8 +858,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, + rtap_space); } ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); @@ -871,8 +902,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!origskb) return NULL; - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, rtap_space); } static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) -- cgit v1.2.3 From d8e79f1dbcee8032667c0718a654c749d64f6304 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 27 May 2020 01:00:20 -0700 Subject: nexthop: Fix type of event_type in call_nexthop_notifiers Clang warns: net/ipv4/nexthop.c:841:30: warning: implicit conversion from enumeration type 'enum nexthop_event_type' to different enumeration type 'enum fib_event_type' [-Wenum-conversion] call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); ~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~~~~~~ 1 warning generated. 
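The pattern clang objects to boils down to this standalone illustration (a sketch, not kernel code; the enumerators are reduced to one each):

	enum nexthop_event_type { NEXTHOP_EVENT_DEL };
	enum fib_event_type { FIB_EVENT_ENTRY_ADD };

	static int notify(enum fib_event_type event_type)
	{
		return (int)event_type;
	}

	int trigger(void)
	{
		/* implicit conversion between distinct enum types: clang warns */
		return notify(NEXTHOP_EVENT_DEL);
	}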
Use the right type for event_type so that clang does not warn. Fixes: 8590ceedb701 ("nexthop: add support for notifiers") Link: https://github.com/ClangBuiltLinux/linux/issues/1038 Signed-off-by: Nathan Chancellor Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 143011f9b580..ec1282858cb7 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -37,7 +37,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { }; static int call_nexthop_notifiers(struct net *net, - enum fib_event_type event_type, + enum nexthop_event_type event_type, struct nexthop *nh) { int err; -- cgit v1.2.3 From 20f6a05ef63594feb0c6dfbd629da0448b43124d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 27 May 2020 12:34:30 +0000 Subject: bridge: mrp: Rework the MRP netlink interface This patch reworks the MRP netlink interface. Before, each attribute represented a binary structure, which made it hard to extend. Therefore update the MRP netlink interface such that each existing attribute becomes a nested attribute that contains the fields of the binary structures. In this way the MRP netlink interface can be extended without breaking backwards compatibility. It also uses strict checking for attributes under the MRP top attribute. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 64 ++++++++-- net/bridge/br_mrp.c | 8 +- net/bridge/br_mrp_netlink.c | 266 +++++++++++++++++++++++++++++++++------ net/bridge/br_private_mrp.h | 2 +- 4 files changed, 290 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bd8c95488f16..5a43eb86c93b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -169,17 +169,69 @@ enum { __IFLA_BRIDGE_MRP_MAX, }; +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_INSTANCE_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE_RING_ID, + IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + __IFLA_BRIDGE_MRP_INSTANCE_MAX, +}; + +#define IFLA_BRIDGE_MRP_INSTANCE_MAX (__IFLA_BRIDGE_MRP_INSTANCE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_STATE_STATE, + __IFLA_BRIDGE_MRP_PORT_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_STATE_MAX (__IFLA_BRIDGE_MRP_PORT_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_ROLE_ROLE, + __IFLA_BRIDGE_MRP_PORT_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_ROLE_MAX (__IFLA_BRIDGE_MRP_PORT_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_STATE_UNSPEC, + IFLA_BRIDGE_MRP_RING_STATE_RING_ID, + IFLA_BRIDGE_MRP_RING_STATE_STATE, + __IFLA_BRIDGE_MRP_RING_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_STATE_MAX (__IFLA_BRIDGE_MRP_RING_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_RING_ROLE_RING_ID, + IFLA_BRIDGE_MRP_RING_ROLE_ROLE, + __IFLA_BRIDGE_MRP_RING_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_ROLE_MAX (__IFLA_BRIDGE_MRP_RING_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_TEST_RING_ID, + IFLA_BRIDGE_MRP_START_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) + struct
br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; }; -struct br_mrp_port_role { - __u32 ring_id; - __u32 role; -}; - struct br_mrp_ring_state { __u32 ring_id; __u32 ring_state; @@ -197,8 +249,6 @@ struct br_mrp_start_test { __u32 period; }; -#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) - struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 528d767eb026..8ea59504ef47 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -376,24 +376,24 @@ int br_mrp_set_port_state(struct net_bridge_port *p, * note: already called with rtnl_lock */ int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role) + enum br_mrp_port_role_type role) { struct br_mrp *mrp; if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; - mrp = br_mrp_find_id(p->br, role->ring_id); + mrp = br_mrp_find_port(p->br, p); if (!mrp) return -EINVAL; - if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + if (role == BR_MRP_PORT_ROLE_PRIMARY) rcu_assign_pointer(mrp->p_port, p); else rcu_assign_pointer(mrp->s_port, p); - br_mrp_port_switchdev_set_role(p, role->role); + br_mrp_port_switchdev_set_role(p, role); return 0; } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 4a08a99519b0..d9de780d2ce0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -8,19 +8,222 @@ static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, - [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_instance)}, - [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, - [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_port_role)}, - [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_state)}, - [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_role)}, - [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_start_test)}, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, }; +static const struct nla_policy +br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { + [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, +}; + +static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1]; + struct br_mrp_instance inst; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr, + br_mrp_instance_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX"); + return -EINVAL; + } + + memset(&inst, 0, sizeof(inst)); + + inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); + inst.p_ifindex = 
nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); + inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + + if (cmd == RTM_SETLINK) + return br_mrp_add(br, &inst); + else + return br_mrp_del(br, &inst); + + return 0; +} + +static const struct nla_policy +br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_state_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1]; + enum br_mrp_port_state_type state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr, + br_mrp_port_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE"); + return -EINVAL; + } + + state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]); + + return br_mrp_set_port_state(p, state); +} + +static const struct nla_policy +br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_role_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1]; + enum br_mrp_port_role_type role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr, + br_mrp_port_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE"); + return -EINVAL; + } + + role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]); + + return br_mrp_set_port_role(p, role); +} + +static const struct nla_policy +br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1]; + struct br_mrp_ring_state state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr, + br_mrp_ring_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or STATE"); + return -EINVAL; + } + + memset(&state, 0x0, sizeof(state)); + + state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]); + state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]); + + return br_mrp_set_ring_state(br, &state); +} + +static const struct nla_policy +br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1]; + struct br_mrp_ring_role role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr, + br_mrp_ring_role_policy, extack); + if 
(err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or ROLE"); + return -EINVAL; + } + + memset(&role, 0x0, sizeof(role)); + + role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]); + role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]); + + return br_mrp_set_ring_role(br, &role); +} + +static const struct nla_policy +br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { + [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, +}; + +static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1]; + struct br_mrp_start_test test; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr, + br_mrp_start_test_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] || + !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] || + !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] || + !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); + return -EINVAL; + } + + memset(&test, 0x0, sizeof(test)); + + test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + + return br_mrp_start_test(br, &test); +} + int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { @@ -44,58 +247,45 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return err; if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { - struct br_mrp_instance *instance = - nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); - - if (cmd == RTM_SETLINK) - err = br_mrp_add(br, instance); - else - err = br_mrp_del(br, instance); + err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE], + cmd, extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { - enum br_mrp_port_state_type state = - nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); - - err = br_mrp_set_port_state(p, state); + err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { - struct br_mrp_port_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); - - err = br_mrp_set_port_role(p, role); + err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { - struct br_mrp_ring_state *state = - nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); - - err = br_mrp_set_ring_state(br, state); + err = br_mrp_ring_state_parse(br, + tb[IFLA_BRIDGE_MRP_RING_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { - struct br_mrp_ring_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); - - err = br_mrp_set_ring_role(br, role); + err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_START_TEST]) { - struct br_mrp_start_test *test = - 
nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); - - err = br_mrp_start_test(br, test); + err = br_mrp_start_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_TEST], + extack); if (err) return err; } diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 2921a4b59f8e..a0f53cc3ab85 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -37,7 +37,7 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role); + enum br_mrp_port_role_type role); int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state); int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); -- cgit v1.2.3 From cb8aa9a3affb7d23b11b11fbed41e2feaabc4b0a Mon Sep 17 00:00:00 2001 From: Romain Bellan Date: Mon, 4 May 2020 21:34:29 +0200 Subject: netfilter: ctnetlink: add kernel side filtering for dump Conntrack dump does not support kernel-side filtering (only get exists, but it returns only one entry, and the user has to give a full valid tuple). It means that userspace has to implement filtering after receiving many irrelevant entries, consuming resources (the conntrack table is sometimes very large, much larger than a routing table, for example). This patch adds filtering on the kernel side. To achieve this goal, we: * Add a new CTA_FILTER netlink attribute, actually a flag list to parametrize filtering * Convert some *nlattr_to_tuple() functions, to allow a partial parsing of CTA_TUPLE_ORIG and CTA_TUPLE_REPLY (so the nf_conntrack_tuple is not fully set) Filtering is now possible on: * IP SRC/DST values * Ports for TCP and UDP flows * ICMP(v6) types, codes and IDs Filtering is done as an "AND" operator. For example, when the flags PROTO_SRC_PORT, PROTO_NUM and IP_SRC are set, only entries matching all values are dumped.
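As a rough userspace sketch (using libmnl; note that the filter flag bits live in the kernel-internal nf_internals.h in this patch, so the bit value used below is an assumption, not uapi):

	#include <arpa/inet.h>
	#include <libmnl/libmnl.h>
	#include <linux/netfilter/nfnetlink.h>
	#include <linux/netfilter/nfnetlink_conntrack.h>

	/* assumed value of the kernel-internal CTA_FILTER_FLAG(CTA_IP_SRC) bit */
	#define ASSUMED_CTA_FILTER_F_CTA_IP_SRC (1 << 0)

	/* Build a dump request that only matches entries whose original
	 * tuple has the given IPv4 source address.
	 */
	static void build_filtered_dump(char *buf, unsigned int seq)
	{
		struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
		struct nfgenmsg *nfh;
		struct nlattr *tuple, *ip, *filter;

		nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
		nlh->nlmsg_seq = seq;

		nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfh));
		nfh->nfgen_family = AF_INET;
		nfh->version = NFNETLINK_V0;

		/* CTA_TUPLE_ORIG carries the values to compare against ... */
		tuple = mnl_attr_nest_start(nlh, CTA_TUPLE_ORIG);
		ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP);
		mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, inet_addr("192.0.2.1"));
		mnl_attr_nest_end(nlh, ip);
		mnl_attr_nest_end(nlh, tuple);

		/* ... and CTA_FILTER_ORIG_FLAGS says which of them matter */
		filter = mnl_attr_nest_start(nlh, CTA_FILTER);
		mnl_attr_put_u32(nlh, CTA_FILTER_ORIG_FLAGS,
				 ASSUMED_CTA_FILTER_F_CTA_IP_SRC);
		mnl_attr_nest_end(nlh, filter);
	}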
Changes since v1: Set NLM_F_DUMP_FILTERED in nlm flags if entries are filtered Changes since v2: Move several constants to nf_internals.h Move a fix on netlink values check in a separate patch Add a check on unsupported flags Return EOPNOTSUPP if CTA_FILTER is set in ctnetlink_flush_conntrack (not yet implemented) Fix code style issues Changes since v3: Fix compilation warning reported by kbuild test robot Changes since v4: Fix a regression introduced in v3 (returned EINVAL for valid netlink messages without CTA_MARK) Changes since v5: Change definition of CTA_FILTER_F_ALL Fix a regression when CTA_TUPLE_ZONE is not set
Signed-off-by: Romain Bellan Signed-off-by: Florent Fourcot Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 6 +- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 9 + net/netfilter/nf_conntrack_core.c | 19 +- net/netfilter/nf_conntrack_netlink.c | 334 ++++++++++++++++++--- net/netfilter/nf_conntrack_proto_icmp.c | 40 ++- net/netfilter/nf_conntrack_proto_icmpv6.c | 42 ++- net/netfilter/nf_internals.h | 17 ++ 7 files changed, 394 insertions(+), 73 deletions(-) (limited to 'net')
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 4cad1f0a327a..88186b95b3c2 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,8 @@ struct nf_conntrack_l4proto { /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); const struct nla_policy *nla_policy; struct { @@ -152,7 +153,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[];
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 1d41810d17e2..262881792671 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -55,6 +55,7 @@ enum ctattr_type { CTA_LABELS, CTA_LABELS_MASK, CTA_SYNPROXY, + CTA_FILTER, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -276,4 +277,12 @@ enum ctattr_expect_stats { }; #define CTA_STATS_EXP_MAX (__CTA_STATS_EXP_MAX - 1) +enum ctattr_filter { + CTA_FILTER_UNSPEC, + CTA_FILTER_ORIG_FLAGS, + CTA_FILTER_REPLY_FLAGS, + __CTA_FILTER_MAX +}; +#define CTA_FILTER_MAX (__CTA_FILTER_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1d57b95d3481..8abb1727bcc4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1974,13 +1974,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { + if (!tb[CTA_PROTO_SRC_PORT]) + return -EINVAL; + + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + } -
t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); - t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { + if (!tb[CTA_PROTO_DST_PORT]) + return -EINVAL; + + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9ddfcd002d3b..d7bd8b1f27d5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -54,6 +54,8 @@ #include #include +#include "nf_internals.h" + MODULE_LICENSE("GPL"); static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, @@ -544,14 +546,16 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct, bool extinfo) + struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - unsigned int flags = portid ? NLM_F_MULTI : 0, event; + unsigned int event; + if (portid) + flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) @@ -847,17 +851,70 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u_int32_t cta_flags; u8 family; + + u_int32_t orig_flags; + u_int32_t reply_flags; + + struct nf_conntrack_tuple orig; + struct nf_conntrack_tuple reply; + struct nf_conntrack_zone zone; + struct { u_int32_t val; u_int32_t mask; } mark; }; +static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { + [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, + [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, +}; + +static int ctnetlink_parse_filter(const struct nlattr *attr, + struct ctnetlink_filter *filter) +{ + struct nlattr *tb[CTA_FILTER_MAX + 1]; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, + NULL); + if (ret) + return ret; + + if (tb[CTA_FILTER_ORIG_FLAGS]) { + filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); + if (filter->orig_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + if (tb[CTA_FILTER_REPLY_FLAGS]) { + filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); + if (filter->reply_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + return 0; +} + +static int ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone); +static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + u32 type, u_int8_t l3num, + struct nf_conntrack_zone *zone, + u_int32_t flags); + +/* applied on filters */ +#define CTA_FILTER_F_CTA_MARK (1 << 0) +#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1) + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; + int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) @@ -871,14 +928,65 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; #ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + if (cda[CTA_MARK]) { filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK); + + if (cda[CTA_MARK_MASK]) { + filter->mark.mask = 
ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK); + } else { + filter->mark.mask = 0xffffffff; + } + } else if (cda[CTA_MARK_MASK]) { + return ERR_PTR(-EINVAL); } #endif + if (!cda[CTA_FILTER]) + return filter; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + return ERR_PTR(err); + + err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); + if (err < 0) + return ERR_PTR(err); + + if (filter->orig_flags) { + if (!cda[CTA_TUPLE_ORIG]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->orig, + CTA_TUPLE_ORIG, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + + if (filter->reply_flags) { + if (!cda[CTA_TUPLE_REPLY]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->reply, + CTA_TUPLE_REPLY, + filter->family, + &filter->zone, + filter->reply_flags); + if (err < 0) + return ERR_PTR(err); + } + return filter; } +static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) +{ + return family || cda[CTA_MARK] || cda[CTA_FILTER]; +} + static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; @@ -886,7 +994,7 @@ static int ctnetlink_start(struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -896,9 +1004,79 @@ static int ctnetlink_start(struct netlink_callback *cb) return 0; } +static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, + struct nf_conntrack_tuple *ct_tuple, + u_int32_t flags, int family) +{ + switch (family) { + case NFPROTO_IPV4: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) + return 0; + break; + case NFPROTO_IPV6: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + ipv6_addr_cmp(&filter_tuple->src.u3.in6, + &ct_tuple->src.u3.in6)) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + ipv6_addr_cmp(&filter_tuple->dst.u3.in6, + &ct_tuple->dst.u3.in6)) + return 0; + break; + } + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && + filter_tuple->dst.protonum != ct_tuple->dst.protonum) + return 0; + + switch (ct_tuple->dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && + filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && + filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) + return 0; + break; + case IPPROTO_ICMP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + case IPPROTO_ICMPV6: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
+ return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + } + + return 1; +} + static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; + struct nf_conntrack_tuple *tuple; if (filter == NULL) goto out; @@ -910,8 +1088,28 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->orig_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); + if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, + filter->orig_flags, + filter->family)) + goto ignore_entry; + } + + if (filter->reply_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); + if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, + filter->reply_flags, + filter->family)) + goto ignore_entry; + } + #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) && + (ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; + else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) && + ct->mark != filter->mark.val) goto ignore_entry; #endif @@ -925,6 +1123,7 @@ ignore_entry: static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -979,7 +1178,7 @@ restart: ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, true); + ct, true, flags); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1014,31 +1213,50 @@ out: } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V4_SRC]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V4_DST]) + return -EINVAL; - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V6_SRC]) + return -EINVAL; - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; @@ -1054,10 +1272,10 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, switch (tuple->src.l3num) { case NFPROTO_IPV4: - ret = ipv4_nlattr_to_tuple(tb, tuple); + ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: - ret = ipv6_nlattr_to_tuple(tb, tuple); + ret = 
ipv6_nlattr_to_tuple(tb, tuple, flags); break; } @@ -1069,7 +1287,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; @@ -1080,8 +1299,12 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, if (ret < 0) return ret; + if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) + return 0; + if (!tb[CTA_PROTO_NUM]) return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); @@ -1092,7 +1315,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, l4proto->nla_policy, NULL); if (ret == 0) - ret = l4proto->nlattr_to_tuple(tb, tuple); + ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); @@ -1143,10 +1366,21 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; +#define CTA_FILTER_F_ALL_CTA_PROTO \ + (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ + CTA_FILTER_F_CTA_PROTO_DST_PORT | \ + CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) + static int -ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, u32 type, - u_int8_t l3num, struct nf_conntrack_zone *zone) +ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone, + u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -1158,23 +1392,32 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; - if (!tb[CTA_TUPLE_IP]) - return -EINVAL; tuple->src.l3num = l3num; - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || + flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; - if (!tb[CTA_TUPLE_PROTO]) - return -EINVAL; + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); + if (err < 0) + return err; + } - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; - if (tb[CTA_TUPLE_ZONE]) { + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); + if (err < 0) + return err; + } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { + /* Can't manage proto flags without a protonum */ + return -EINVAL; + } + + if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; @@ -1193,6 +1436,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) +{ + return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, + CTA_FILTER_FLAG(ALL)); +} + static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1240,6 +1492,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = 
NF_CT_LABELS_MAX_SIZE }, + [CTA_FILTER] = { .type = NLA_NESTED }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1256,7 +1509,10 @@ static int ctnetlink_flush_conntrack(struct net *net, { struct ctnetlink_filter *filter = NULL; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { + if (cda[CTA_FILTER]) + return -EOPNOTSUPP; + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1385,7 +1641,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) goto free; @@ -1458,7 +1714,7 @@ restart: res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, dying ? true : false); + ct, dying ? true : false, 0); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index c2e3dff773bc..4efd8741c105 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -20,6 +20,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -271,20 +273,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { + if (!tb[CTA_PROTO_ICMP_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { + if (!tb[CTA_PROTO_ICMP_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { + if (!tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 6f9144e1f1c1..facd8c64ec4e 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -24,6 +24,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, @@ -193,21 +195,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - 
tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { + if (!tb[CTA_PROTO_ICMPV6_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { + if (!tb[CTA_PROTO_ICMPV6_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { + if (!tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + } return 0; }
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index d6c43902ebd7..832ae64179f0 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -6,6 +6,23 @@ #include #include +/* nf_conntrack_netlink.c: applied on tuple filters */ +#define CTA_FILTER_F_CTA_IP_SRC (1 << 0) +#define CTA_FILTER_F_CTA_IP_DST (1 << 1) +#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) +#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3) +#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4) +#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5) +#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6) +#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7) +#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11) +#define CTA_FILTER_F_MAX (1 << 12) +#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1) +#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr + /* nf_queue.c */ void nf_queue_nf_hook_drop(struct net *net); -- cgit v1.2.3
From d9246a53752fdb777ed176d5f091c4ac0e482bba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:42:44 +0200 Subject: netfilter: nf_tables: generalise flowtable hook parsing Update nft_flowtable_parse_hook() to take the flowtable hook list as a parameter. This allows the function to be reused to update the hooks.
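In sketch form, the refactor moves the parse result out of struct nft_flowtable into a small caller-owned structure, so the same parser can serve both flowtable creation and, later, hook updates. A simplified plain-C model (hypothetical names; the kernel version uses struct nft_flowtable_hook and list_head, as the diff below shows):

    #include <stddef.h>

    struct hook {
            struct hook *next;       /* hand-rolled stand-in for list_head */
            const char *dev;
    };

    struct hook_cfg {                /* models struct nft_flowtable_hook */
            int num;
            int priority;
            struct hook *hooks;
    };

    /* The parser fills 'cfg' and never touches the flowtable itself. */
    static int parse_hook(struct hook_cfg *cfg)
    {
            cfg->num = 0;            /* would come from the HOOK_NUM attribute */
            cfg->priority = 0;       /* would come from HOOK_PRIORITY */
            cfg->hooks = NULL;       /* would come from HOOK_DEVS */
            return 0;
    }

    struct flowtable {
            int hooknum;
            int priority;
            struct hook *hooks;
    };

    /* Creation adopts the parsed result wholesale; an update path can
     * instead merge cfg->hooks into an existing flowtable. */
    static void flowtable_adopt(struct flowtable *ft, struct hook_cfg *cfg)
    {
            ft->hooknum = cfg->num;
            ft->priority = cfg->priority;
            ft->hooks = cfg->hooks;
            cfg->hooks = NULL;
    }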
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3558e76e2733..87945b4a6789 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6178,21 +6178,30 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_flowtable_hook { + u32 num; + int priority; + struct list_head list; +}; + static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, }; -static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct nft_flowtable *flowtable) +static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct nft_flowtable_hook *flowtable_hook, + struct nf_flowtable *ft) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; int hooknum, priority; int err; + INIT_LIST_HEAD(&flowtable_hook->list); + err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); if (err < 0) @@ -6211,19 +6220,19 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, err = nf_tables_parse_netdev_hooks(ctx->net, tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable->hook_list); + &flowtable_hook->list); if (err < 0) return err; - flowtable->hooknum = hooknum; - flowtable->data.priority = priority; + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; hook->ops.hooknum = hooknum; hook->ops.priority = priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + hook->ops.priv = ft; + hook->ops.hook = ft->type->hook; } return err; @@ -6336,6 +6345,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_flowtable_hook flowtable_hook; const struct nf_flowtable_type *type; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -6409,11 +6419,15 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err3; - err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - flowtable); + err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); if (err < 0) goto err4; + list_splice(&flowtable_hook.list, &flowtable->hook_list); + flowtable->data.priority = flowtable_hook.priority; + flowtable->hooknum = flowtable_hook.num; + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); if (err < 0) { list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { -- cgit v1.2.3 From f9382669cf5e75ebc7636bd78e637facf27d53f7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 May 2020 01:00:07 +0200 Subject: netfilter: nf_tables: pass hook list to nft_{un,}register_flowtable_net_hooks() This patch prepares for incremental flowtable hook updates. 
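One detail worth noting in the diff below: nft_register_flowtable_net_hooks() counts successful registrations so that a mid-list failure unwinds only the hooks that actually went in (the i-- logic in the error path). That pattern, reduced to a self-contained sketch where register_one()/unregister_one() are hypothetical stubs:

    static int register_one(int hook)    { (void)hook; return 0; }
    static void unregister_one(int hook) { (void)hook; }

    static int register_hooks(const int *hooks, int n)
    {
            int i, err;

            for (i = 0; i < n; i++) {
                    err = register_one(hooks[i]);
                    if (err)
                            goto unwind;
            }
            return 0;

    unwind:
            while (--i >= 0)        /* undo only what succeeded */
                    unregister_one(hooks[i]);
            return err;
    }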
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 87945b4a6789..1505552aaa74 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6279,23 +6279,24 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, - struct nft_flowtable *flowtable) + struct list_head *hook_list) { struct nft_hook *hook; - list_for_each_entry(hook, &flowtable->hook_list, list) + list_for_each_entry(hook, hook_list, list) nf_unregister_net_hook(net, &hook->ops); } static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, + struct list_head *hook_list, struct nft_flowtable *flowtable) { struct nft_hook *hook, *hook2, *next; struct nft_flowtable *ft; int err, i = 0; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && @@ -6326,7 +6327,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, return 0; err_unregister_net_hooks: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_for_each_entry_safe(hook, next, hook_list, list) { if (i-- <= 0) break; @@ -6428,7 +6429,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; - err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + err = nft_register_flowtable_net_hooks(ctx.net, table, + &flowtable->hook_list, + flowtable); if (err < 0) { list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); @@ -7493,7 +7496,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_flowtable(trans), NFT_MSG_DELFLOWTABLE); nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + &nft_trans_flowtable(trans)->hook_list); break; } } @@ -7652,7 +7655,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) trans->ctx.table->use--; list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + &nft_trans_flowtable(trans)->hook_list); break; case NFT_MSG_DELFLOWTABLE: trans->ctx.table->use++; -- cgit v1.2.3
From 389a2cbcb7f15e2af9babdc0c63cec318537e7ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:43:43 +0200 Subject: netfilter: nf_tables: add nft_flowtable_hooks_destroy() This patch adds a helper function to destroy the flowtable hooks.
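Destroying hooks while walking the list requires reading the next pointer before freeing the current node, which is exactly what list_for_each_entry_safe() does in the helper. The same idea in plain C with a hand-rolled singly linked list (a sketch, not the kernel code):

    #include <stdlib.h>

    struct hook {
            struct hook *next;
    };

    static void destroy_hooks(struct hook **head)
    {
            struct hook *h = *head, *next;

            while (h) {
                    next = h->next;  /* grab before freeing the node */
                    free(h);
                    h = next;
            }
            *head = NULL;
    }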
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1505552aaa74..f5d100787ccb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6339,6 +6339,16 @@ err_unregister_net_hooks: return err; } +static void nft_flowtable_hooks_destroy(struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6433,10 +6443,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, &flowtable->hook_list, flowtable); if (err < 0) { - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } + nft_flowtable_hooks_destroy(&flowtable->hook_list); goto err4; } -- cgit v1.2.3
From c42d8bda69e291c5551497a02db71b50d95510d4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:44:18 +0200 Subject: netfilter: nf_tables: pass hook list to flowtable event notifier Update the flowtable netlink notifier to take the list of hooks as input. This allows the function to be reused for incremental flowtable hook updates. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f5d100787ccb..4db70e68d7f4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6523,7 +6523,8 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct list_head *hook_list) { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; @@ -6559,7 +6560,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -6612,7 +6613,9 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, - table->family, flowtable) < 0) + table->family, + flowtable, + &flowtable->hook_list) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -6709,7 +6712,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable); + flowtable, &flowtable->hook_list); if (err < 0) goto err; @@ -6721,6 +6724,7 @@ err: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, + struct list_head *hook_list, int event) { struct sk_buff *skb; @@ -6736,7 +6740,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, - ctx->family, flowtable); + ctx->family, flowtable, hook_list); if (err < 0) {
kfree_skb(skb); goto err; @@ -7494,6 +7498,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_clear(net, nft_trans_flowtable(trans)); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, NFT_MSG_NEWFLOWTABLE); nft_trans_destroy(trans); break; @@ -7501,6 +7506,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, NFT_MSG_DELFLOWTABLE); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); -- cgit v1.2.3 From 78d9f48f7f44431a25da2b46b3a8812f6ff2b981 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:46:47 +0200 Subject: netfilter: nf_tables: add devices to existing flowtable This patch allows users to add devices to an existing flowtable. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +++ net/netfilter/nf_tables_api.c | 97 ++++++++++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d4e29c952c40..4f58c4411bb4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1481,10 +1481,16 @@ struct nft_trans_obj { struct nft_trans_flowtable { struct nft_flowtable *flowtable; + bool update; + struct list_head hook_list; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) +#define nft_trans_flowtable_update(trans) \ + (((struct nft_trans_flowtable *)trans->data)->update) +#define nft_trans_flowtable_hooks(trans) \ + (((struct nft_trans_flowtable *)trans->data)->hook_list) int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4db70e68d7f4..98f2cbb97e39 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6349,6 +6349,62 @@ static void nft_flowtable_hooks_destroy(struct list_head *hook_list) } } +static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *hook, *next; + struct nft_trans *trans; + bool unregister = false; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); + if (err < 0) + return err; + + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (nft_hook_list_find(&flowtable->hook_list, hook)) { + list_del(&hook->list); + kfree(hook); + } + } + + err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, + &flowtable_hook.list, flowtable); + if (err < 0) + goto err_flowtable_update_hook; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) { + unregister = true; + err = -ENOMEM; + goto err_flowtable_update_hook; + } + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_update_hook: + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (unregister) + 
nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; + +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6392,7 +6448,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -EEXIST; } - return 0; + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nft_flowtable_update(&ctx, nlh, flowtable); } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -7495,11 +7553,20 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELOBJ); break; case NFT_MSG_NEWFLOWTABLE: - nft_clear(net, nft_trans_flowtable(trans)); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_NEWFLOWTABLE); + if (nft_trans_flowtable_update(trans)) { + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_NEWFLOWTABLE); + list_splice(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); + } else { + nft_clear(net, nft_trans_flowtable(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_NEWFLOWTABLE); + } nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: @@ -7558,7 +7625,10 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } kfree(trans); @@ -7665,10 +7735,15 @@ static int __nf_tables_abort(struct net *net, bool autoload) nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: - trans->ctx.table->use--; - list_del_rcu(&nft_trans_flowtable(trans)->list); - nft_unregister_flowtable_net_hooks(net, - &nft_trans_flowtable(trans)->hook_list); + if (nft_trans_flowtable_update(trans)) { + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + trans->ctx.table->use--; + list_del_rcu(&nft_trans_flowtable(trans)->list); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; case NFT_MSG_DELFLOWTABLE: trans->ctx.table->use++; -- cgit v1.2.3 From abadb2f865d72a223d691fc68e006943ecadf0d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 13:46:51 +0200 Subject: netfilter: nf_tables: delete devices from flowtable This patch allows users to delete devices from existing flowtables. 
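Together with the previous patch, device add/delete becomes transactional, driven by the per-hook 'inactive' flag: the prepare step only marks hooks, commit actually unregisters the marked ones, and abort clears the marks again. A compact model with hypothetical names and a stub teardown:

    #include <stdbool.h>

    struct hook {
            bool inactive;
            /* device, ops and list linkage elided */
    };

    static void unregister_hook(struct hook *h) { (void)h; /* teardown */ }

    static void prepare_delete(struct hook *h)
    {
            h->inactive = true;          /* nothing is torn down yet */
    }

    static void abort_delete(struct hook *h)
    {
            h->inactive = false;         /* roll the transaction back */
    }

    static void commit_delete(struct hook *h)
    {
            if (h->inactive)
                    unregister_hook(h);  /* only now does teardown happen */
    }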
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 113 ++++++++++++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4f58c4411bb4..6f0f6fca9ac3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1002,6 +1002,7 @@ struct nft_stats { struct nft_hook { struct list_head list; + bool inactive; struct nf_hook_ops ops; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 98f2cbb97e39..1c2c3bb78fa0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1669,6 +1669,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_dev; } hook->ops.dev = dev; + hook->inactive = false; return hook; @@ -1678,17 +1679,17 @@ err_hook_alloc: return ERR_PTR(err); } -static bool nft_hook_list_find(struct list_head *hook_list, - const struct nft_hook *this) +static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) { struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { if (this->ops.dev == hook->ops.dev) - return true; + return hook; } - return false; + return NULL; } static int nf_tables_parse_netdev_hooks(struct net *net, @@ -6530,6 +6531,51 @@ err1: return err; } +static int nft_delflowtable_hook(struct nft_ctx *ctx, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *this, *next, *hook; + struct nft_trans *trans; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, &flowtable->data); + if (err < 0) + return err; + + list_for_each_entry_safe(this, next, &flowtable_hook.list, list) { + hook = nft_hook_list_find(&flowtable->hook_list, this); + if (!hook) { + err = -ENOENT; + goto err_flowtable_del_hook; + } + hook->inactive = true; + list_del(&this->list); + kfree(this); + } + + trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) + return -ENOMEM; + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_del_hook: + list_for_each_entry(hook, &flowtable_hook.list, list) + hook->inactive = false; + + return err; +} + static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6568,13 +6614,17 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } + + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + if (nla[NFTA_FLOWTABLE_HOOK]) + return nft_delflowtable_hook(&ctx, flowtable); + if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - return nft_delflowtable(&ctx, flowtable); } @@ -7184,7 +7234,10 @@ static void nft_commit_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + 
nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -7345,6 +7398,17 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (hook->inactive) + list_move(&hook->list, hook_list); + } +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nft_module_request *req, *next; @@ -7570,13 +7634,24 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: - list_del_rcu(&nft_trans_flowtable(trans)->list); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - &nft_trans_flowtable(trans)->hook_list, - NFT_MSG_DELFLOWTABLE); - nft_unregister_flowtable_net_hooks(net, - &nft_trans_flowtable(trans)->hook_list); + if (nft_trans_flowtable_update(trans)) { + nft_flowtable_hooks_del(nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + list_del_rcu(&nft_trans_flowtable(trans)->list); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; } } @@ -7638,6 +7713,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; + struct nft_hook *hook; list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { @@ -7746,8 +7822,13 @@ static int __nf_tables_abort(struct net *net, bool autoload) } break; case NFT_MSG_DELFLOWTABLE: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) { + list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list) + hook->inactive = false; + } else { + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + } nft_trans_destroy(trans); break; } -- cgit v1.2.3 From 05abe4456fa376040f6cc3cc6830d2e328723478 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 20 May 2020 15:44:37 +0200 Subject: netfilter: nf_tables: allow to register flowtable with no devices A flowtable might be composed of dynamic interfaces only. Such dynamic interfaces might show up at a later stage. This patch allows users to register a flowtable with no devices. Once the dynamic interface becomes available, the user adds the dynamic devices to the flowtable. 
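The hunks below create an asymmetry: a flowtable's device list may now parse to empty (the hooks arrive later via updates), while a netdev chain still rejects an empty list, since the list_empty() check moves into nft_chain_parse_netdev(). A sketch of that split, with a hypothetical allow_empty parameter:

    #include <errno.h>
    #include <stdbool.h>

    /* The caller decides whether an empty device list is acceptable. */
    static int parse_devs(const char **devs, int n, bool allow_empty)
    {
            if (n == 0)
                    return allow_empty ? 0 : -EINVAL;
            /* ... resolve each device name into a hook ... */
            return 0;
    }

    /* flowtable:    parse_devs(devs, n, true);   devices may come later
     * netdev chain: parse_devs(devs, n, false);  devices still required */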
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net')
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1c2c3bb78fa0..897ac5fbe079 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1724,8 +1724,6 @@ static int nf_tables_parse_netdev_hooks(struct net *net, goto err_hook; } } - if (!n) - return -EINVAL; return 0; @@ -1762,6 +1760,9 @@ static int nft_chain_parse_netdev(struct net *net, hook_list); if (err < 0) return err; + + if (list_empty(hook_list)) + return -EINVAL; } else { return -EINVAL; } @@ -6209,8 +6210,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] || - !tb[NFTA_FLOWTABLE_HOOK_DEVS]) + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) return -EINVAL; hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); @@ -6219,11 +6219,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_netdev_hooks(ctx->net, - tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable_hook->list); - if (err < 0) - return err; + if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable_hook->list); + if (err < 0) + return err; + } flowtable_hook->priority = priority; flowtable_hook->num = hooknum; -- cgit v1.2.3
From 5b6743fb2c2a1fcb31c8b227558f537095dbece4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 23 May 2020 12:05:22 +0200 Subject: netfilter: nf_tables: skip flowtable hooknum and priority on device updates On device updates, the hooknum and priority attributes are not required. This patch makes these two netlink attributes optional. Moreover, bail out with EOPNOTSUPP if userspace tries to update the hooknum and priority for existing flowtables. While at it, turn EINVAL into EOPNOTSUPP in case the hooknum is not ingress. EINVAL is reserved for missing netlink attributes / malformed netlink messages.
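The resulting rules: on creation, hook number and priority are mandatory; on an update they are optional, but if present they must match the existing flowtable, otherwise EOPNOTSUPP. Condensed into a sketch (hypothetical struct, not the kernel code):

    #include <errno.h>
    #include <stdbool.h>

    struct hook_attrs {
            bool has_num, has_prio;
            int num, prio;
    };

    static int check_hook_attrs(bool add, const struct hook_attrs *req,
                                const struct hook_attrs *cur)
    {
            if (add)
                    return (req->has_num && req->has_prio) ? 0 : -EINVAL;

            /* update: attributes are optional but immutable */
            if (req->has_num && req->num != cur->num)
                    return -EOPNOTSUPP;
            if (req->has_prio && req->prio != cur->prio)
                    return -EOPNOTSUPP;
            return 0;
    }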
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 53 ++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 897ac5fbe079..073aa1051d43 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6195,7 +6195,7 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable_hook *flowtable_hook, - struct nf_flowtable *ft) + struct nft_flowtable *flowtable, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; @@ -6209,15 +6209,35 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, if (err < 0) return err; - if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) - return -EINVAL; + if (add) { + if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) + return -EINVAL; - hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); - if (hooknum != NF_NETDEV_INGRESS) - return -EINVAL; + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != NF_NETDEV_INGRESS) + return -EOPNOTSUPP; + + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; + } else { + if (tb[NFTA_FLOWTABLE_HOOK_NUM]) { + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != flowtable->hooknum) + return -EOPNOTSUPP; + } + + if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + if (priority != flowtable->data.priority) + return -EOPNOTSUPP; + } - priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + flowtable_hook->priority = flowtable->data.priority; + flowtable_hook->num = flowtable->hooknum; + } if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(ctx->net, @@ -6227,15 +6247,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } - flowtable_hook->priority = priority; - flowtable_hook->num = hooknum; - list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = hooknum; - hook->ops.priority = priority; - hook->ops.priv = ft; - hook->ops.hook = ft->type->hook; + hook->ops.hooknum = flowtable_hook->num; + hook->ops.priority = flowtable_hook->priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -6363,7 +6380,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, false); if (err < 0) return err; @@ -6492,7 +6509,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err3; err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, true); if (err < 0) goto err4; @@ -6543,7 +6560,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, int err; err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], - &flowtable_hook, &flowtable->data); + &flowtable_hook, flowtable, false); if (err < 0) return err; -- cgit v1.2.3 From f745664257b62f6ba29f45fd21fbe7193b4da57b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:49 
-0700 Subject: tcp: add tcp_ld_RTO_revert() helper RFC 6069 logic has been implemented for IPv4 only so far, right in the middle of tcp_v4_err() and was error prone. Move this code to one helper, to make tcp_v4_err() more readable and to eventually expand RFC 6069 to IPv6 in the future. Also perform sock_owned_by_user() check a bit sooner. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 85 ++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6789671f0f5a..f32dcadc91b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -403,6 +403,45 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } EXPORT_SYMBOL(tcp_req_err); +/* TCP-LD (RFC 6069) logic */ +static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + s32 remaining; + u32 delta_us; + + if (sock_owned_by_user(sk)) + return; + + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + return; + + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + return; + + icsk->icsk_backoff--; + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); + remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); + + if (remaining > 0) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now. + */ + tcp_retransmit_timer(sk); + } +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -423,17 +462,13 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); - struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; - struct sk_buff *skb; struct request_sock *fastopen; u32 seq, snd_una; - s32 remaining; - u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); @@ -476,7 +511,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); @@ -521,41 +555,12 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } err = icmp_err_convert[code].errno; - /* check if icmp_skb allows revert of backoff - * (see draft-zimmermann-tcp-lcd) */ - if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) - break; - if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff || fastopen) - break; - - if (sock_owned_by_user(sk)) - break; - - skb = tcp_rtx_queue_head(sk); - if (WARN_ON_ONCE(!skb)) - break; - - icsk->icsk_backoff--; - icsk->icsk_rto = tp->srtt_us ? 
__tcp_set_rto(tp) : - TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - - - tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); - remaining = icsk->icsk_rto - - usecs_to_jiffies(delta_us); - - if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); - } else { - /* RTO revert clocked out retransmission. - * Will retransmit now */ - tcp_retransmit_timer(sk); - } - + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && + (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) + tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; -- cgit v1.2.3
From a12daf13a4492bb7fb26231e52afb381927f938e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2020 19:48:50 -0700 Subject: tcp: rename tcp_v4_err() skb parameter This essentially reverts 4d1a2d9ec1c1 ("Revert Backoff [v3]: Rename skb to icmp_skb in tcp_v4_err()") Now that we have the tcp_ld_RTO_revert() helper, we can use the usual name for the sk_buff parameter, so that tcp_v4_err() and tcp_v6_err() use similar names. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net')
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f32dcadc91b7..4eef5b84fff1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -458,23 +458,23 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) * */ -int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; - struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(icmp_skb)->type; - const int code = icmp_hdr(icmp_skb)->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; struct request_sock *fastopen; u32 seq, snd_una; int err; - struct net *net = dev_net(icmp_skb->dev); + struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb), 0); + inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; @@ -524,7 +524,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) - do_redirect(icmp_skb, sk); + do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ @@ -578,7 +578,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - ip_icmp_error(sk, icmp_skb, err, th->dest, info, (u8 *)th); + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); if (!sock_owned_by_user(sk)) { sk->sk_err = err; -- cgit v1.2.3
From 4909daba37846317ec7dcba16fba009636f7fe21 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:23 -0700 Subject: net_sched: use qdisc_reset() in qdisc_destroy() qdisc_destroy() calls ops->reset() and cleans up qdisc->gso_skb and qdisc->skb_bad_txq; these are nearly the same as qdisc_reset(), so just call it directly and consolidate the code for the next patch.
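The duplication being removed is easy to see in miniature: both paths run ops->reset() and then purge the gso_skb and skb_bad_txq queues, so destroy can simply call reset first. A reduced model with hypothetical types (the real code operates on struct Qdisc):

    struct qdisc;

    struct qdisc_ops {
            void (*reset)(struct qdisc *q);
            void (*destroy)(struct qdisc *q);
    };

    struct qdisc {
            const struct qdisc_ops *ops;
            /* gso_skb and skb_bad_txq queues elided */
    };

    static void purge_queues(struct qdisc *q) { (void)q; /* free both queues */ }

    static void qdisc_reset_model(struct qdisc *q)
    {
            if (q->ops->reset)
                    q->ops->reset(q);
            purge_queues(q);
    }

    static void qdisc_destroy_model(struct qdisc *q)
    {
            qdisc_reset_model(q);        /* replaces the duplicated purge code */
            if (q->ops->destroy)
                    q->ops->destroy(q);
    }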
Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ebc55d884247..7a0b06001e48 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -949,7 +949,6 @@ static void qdisc_free_cb(struct rcu_head *head) static void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; - struct sk_buff *skb, *tmp; #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -957,24 +956,15 @@ static void qdisc_destroy(struct Qdisc *qdisc) qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); - if (ops->reset) - ops->reset(qdisc); + + qdisc_reset(qdisc); + if (ops->destroy) ops->destroy(qdisc); module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { - __skb_unlink(skb, &qdisc->gso_skb); - kfree_skb_list(skb); - } - - skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { - __skb_unlink(skb, &qdisc->skb_bad_txq); - kfree_skb_list(skb); - } - call_rcu(&qdisc->rcu, qdisc_free_cb); } -- cgit v1.2.3 From a34dac0b9055202cf9c64e08d8d8dc5e23029d3a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:24 -0700 Subject: net_sched: add tracepoints for qdisc_reset() and qdisc_destroy() Add two tracepoints for qdisc_reset() and qdisc_destroy() to track qdisc resetting and destroying. Sample output: tc-756 [000] ...3 138.355662: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355720: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355867: qdisc_reset: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-756 [000] ...1 138.355930: qdisc_destroy: dev=ens3 kind=pfifo_fast parent=ffff:ffff handle=0:0 tc-757 [000] ...2 143.073780: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.073878: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.074114: qdisc_reset: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 tc-757 [000] ...1 143.074228: qdisc_destroy: dev=ens3 kind=fq_codel parent=ffff:ffff handle=8001:0 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/trace/events/qdisc.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 4 ++++ 2 files changed, 56 insertions(+) (limited to 'net') diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 0d1a9ebf55ba..2b948801afa3 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include TRACE_EVENT(qdisc_dequeue, @@ -44,6 +46,56 @@ TRACE_EVENT(qdisc_dequeue, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); +TRACE_EVENT(qdisc_reset, + + TP_PROTO(struct Qdisc *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __string( dev, qdisc_dev(q) ) + __string( kind, q->ops->id ) + __field( u32, parent ) + __field( u32, handle ) + ), + + TP_fast_assign( + __assign_str(dev, qdisc_dev(q)); + __assign_str(kind, q->ops->id); + __entry->parent = q->parent; + __entry->handle = q->handle; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), + __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), + TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) +); + +TRACE_EVENT(qdisc_destroy, + + TP_PROTO(struct Qdisc *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __string( dev, qdisc_dev(q) ) + __string( kind, q->ops->id ) + __field( u32, parent ) + __field( u32, handle ) + ), + + TP_fast_assign( + __assign_str(dev, qdisc_dev(q)); + __assign_str(kind, q->ops->id); + __entry->parent = q->parent; + __entry->handle = q->handle; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), + __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), + TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) +); + #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7a0b06001e48..abaa446ed01a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -911,6 +911,8 @@ void qdisc_reset(struct Qdisc *qdisc) const struct Qdisc_ops *ops = qdisc->ops; struct sk_buff *skb, *tmp; + trace_qdisc_reset(qdisc); + if (ops->reset) ops->reset(qdisc); @@ -965,6 +967,8 @@ static void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); + trace_qdisc_destroy(qdisc); + call_rcu(&qdisc->rcu, qdisc_free_cb); } -- cgit v1.2.3 From f5a7833e83628f18c1ee94e6ffcb1d232f029be9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:25 -0700 Subject: net_sched: add a tracepoint for qdisc creation With this tracepoint, we can see when qdiscs are created, especially the default qdiscs. Sample output: tc-736 [001] ...1 56.230107: qdisc_create: dev=ens3 kind=pfifo parent=1:0 tc-736 [001] ...1 56.230113: qdisc_create: dev=ens3 kind=hfsc parent=ffff:ffff tc-738 [001] ...1 56.256816: qdisc_create: dev=ens3 kind=pfifo parent=1:100 tc-739 [001] ...1 56.267584: qdisc_create: dev=ens3 kind=pfifo parent=1:200 tc-740 [001] ...1 56.279649: qdisc_create: dev=ens3 kind=fq_codel parent=1:100 tc-741 [001] ...1 56.289996: qdisc_create: dev=ens3 kind=pfifo_fast parent=1:200 tc-745 [000] .N.1 111.687483: qdisc_create: dev=ens3 kind=ingress parent=ffff:fff1 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller
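Like the qdisc_reset and qdisc_destroy events above, this event should be controllable at runtime through the usual tracepoint interface, e.g. echo 1 > /sys/kernel/tracing/events/qdisc/qdisc_create/enable, with output read from trace_pipe; the exact path depends on where tracefs is mounted (older setups expose it under /sys/kernel/debug/tracing).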
--- include/trace/events/qdisc.h | 23 +++++++++++++++++++++ net/sched/sch_api.c | 3 +++ net/sched/sch_generic.c | 4 +++- 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index 2b948801afa3..330d32d84485 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -96,6 +96,29 @@ TRACE_EVENT(qdisc_destroy, TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); +TRACE_EVENT(qdisc_create, + + TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), + + TP_ARGS(ops, dev, parent), + + TP_STRUCT__entry( + __string( dev, dev->name ) + __string( kind, ops->id ) + __field( u32, parent ) + ), + + TP_fast_assign( + __assign_str(dev, dev->name); + __assign_str(kind, ops->id); + __entry->parent = parent; + ), + + TP_printk("dev=%s kind=%s parent=%x:%x", + __get_str(dev), __get_str(kind), + TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) +); + #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0d99df1e764d..9a3449b56bd6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -32,6 +32,8 @@ #include #include +#include + /* Short review. @@ -1283,6 +1285,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } qdisc_hash_add(sch, false); + trace_qdisc_create(ops, dev, parent); return sch; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index abaa446ed01a..a4271e47f220 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -896,8 +896,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, } sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL, extack) == 0) + if (!ops->init || ops->init(sch, NULL, extack) == 0) { + trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; + } qdisc_put(sch); return NULL; -- cgit v1.2.3 From 70f50965338a12e17454c31ad5ece27069719358 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:26 -0700 Subject: net_sched: avoid resetting active qdisc multiple times MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Except for sch_mq and sch_mqprio, each dev queue points to the same root qdisc, so when we reset the dev queues with netdev_for_each_tx_queue() we end up resetting the same instance of the root qdisc multiple times. Avoid this by checking the __QDISC_STATE_DEACTIVATED bit in each iteration, so for sch_mq/sch_mqprio we still reset all of them as before, while for the rest we only reset it once. Reported-by: Václav Zindulka Tested-by: Václav Zindulka Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/sch_generic.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a4271e47f220..d13e27467470 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1128,6 +1128,28 @@ void dev_activate(struct net_device *dev) } EXPORT_SYMBOL(dev_activate); +static void qdisc_deactivate(struct Qdisc *qdisc) +{ + bool nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (qdisc->flags & TCQ_F_BUILTIN) + return; + if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state)) + return; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); + spin_lock_bh(qdisc_lock(qdisc)); + + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) + spin_unlock_bh(&qdisc->seqlock); +} + static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) @@ -1137,21 +1159,8 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { - bool nolock = qdisc->flags & TCQ_F_NOLOCK; - - if (nolock) - spin_lock_bh(&qdisc->seqlock); - spin_lock_bh(qdisc_lock(qdisc)); - - if (!(qdisc->flags & TCQ_F_BUILTIN)) - set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - + qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) - spin_unlock_bh(&qdisc->seqlock); } } -- cgit v1.2.3 From 759ae57f1b7bf5dfea1816f786f568d69efe34e2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2020 21:35:27 -0700 Subject: net_sched: get rid of unnecessary dev_qdisc_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resetting the old qdisc on dev_queue->qdisc_sleeping in dev_qdisc_reset() is redundant, because this qdisc, even if it is not the same as dev_queue->qdisc, is reset via qdisc_put() right after dev_graft_qdisc() is called, when its refcount hits 0. This is very easy to observe with the qdisc_reset() tracepoint and stack traces. Reported-by: Václav Zindulka Tested-by: Václav Zindulka Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/sch_generic.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d13e27467470..b19a0021a0bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1191,16 +1191,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } -static void dev_qdisc_reset(struct net_device *dev, - struct netdev_queue *dev_queue, - void *none) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - - if (qdisc) - qdisc_reset(qdisc); -} - /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -1237,12 +1227,6 @@ void dev_deactivate_many(struct list_head *head) */ schedule_timeout_uninterruptible(1); } - /* The new qdisc is assigned at this point so we can safely - * unwind stale skb lists and qdisc statistics - */ - netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); - if (dev_ingress_queue(dev)) - dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); } } -- cgit v1.2.3 From b3ae2459f89773adcbf16fef4b68deaaa3be1929 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 27 May 2020 12:25:26 +0300 Subject: net/tls: Add force_resync for driver resync This patch adds a field to the tls rx offload context which enables drivers to force a send_resync call. This field can be used by drivers to request a resync at the next possible tls record. It is beneficial for hardware that provides the resync sequence number asynchronously. In such cases, the packet that triggered the resync does not contain the information required for a resync. Instead, the driver requests resync for all the following TLS records until the asynchronous notification with the resync request TCP sequence arrives. A follow-up series for mlx5e ConnectX-6DX TLS RX offload support will use this mechanism. Signed-off-by: Boris Pismenny Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller
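A rough sketch of the intended driver-side usage (the driver function name is hypothetical; only tls_offload_rx_force_resync_request() comes from this patch):

	/* Hypothetical driver event handler: the device reported that
	 * record tracking was lost, but the resync TCP sequence number
	 * will only arrive later via an asynchronous notification.
	 */
	static void mydrv_tls_rx_lost_sync(struct sock *sk)
	{
		/* Ask the TLS core to keep requesting a resync on every
		 * new record until the async notification supplies the
		 * real sequence number.
		 */
		tls_offload_rx_force_resync_request(sk);
	}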
--- include/net/tls.h | 12 +++++++++++- net/tls/tls_device.c | 9 ++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index bf9eb4823933..cf9ec152fbb7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -594,12 +594,22 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) #endif /* The TLS context is valid until sk_destruct is called */ +#define RESYNC_REQ (1 << 0) +#define RESYNC_REQ_FORCE (1 << 1) static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1); + atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); +} + +static inline void tls_offload_rx_force_resync_request(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, RESYNC_REQ | RESYNC_REQ_FORCE); } static inline void diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a562ebaaa33c..0e55f8365ce2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -694,10 +694,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + bool is_req_pending, is_force_resync; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; - u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u32 sock_data; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -712,9 +713,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - is_req_pending = resync_req; + is_req_pending = resync_req & RESYNC_REQ; + is_force_resync = resync_req & RESYNC_REQ_FORCE; - if (likely(!is_req_pending) || req_seq != seq || + if (likely(!is_req_pending) || + (!is_force_resync && req_seq != seq) || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; -- cgit v1.2.3 From 50ce4c099bebf56be86c9448f7f4bcd34f33663c Mon Sep 17 00:00:00 2001 From: Jonas Falkevik Date: Wed, 27 May 2020 11:59:43 +0200 Subject: sctp: fix typo sctp_ulpevent_nofity_peer_addr_change change typo in function name "nofity" to "notify" sctp_ulpevent_nofity_peer_addr_change -> sctp_ulpevent_notify_peer_addr_change Signed-off-by: Jonas Falkevik Signed-off-by: David S.
Miller --- include/net/sctp/ulpevent.h | 2 +- net/sctp/associola.c | 8 ++++---- net/sctp/ulpevent.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 0b032b92da0b..994e984eef32 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -80,7 +80,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( struct sctp_chunk *chunk, gfp_t gfp); -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error); struct sctp_ulpevent *sctp_ulpevent_make_remote_error( diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 437079a4883d..72315137d7e7 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -432,7 +432,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ @@ -574,7 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -714,7 +714,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { @@ -840,7 +840,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to the user. */ if (ulp_notify) - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, spc_state, error); /* Select new active and retran paths. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c82dbdcf13f2..f0640306e77f 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -336,7 +336,7 @@ fail: return NULL; } -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error) { struct sctp_association *asoc = transport->asoc; -- cgit v1.2.3 From 7a15b2e013f535a125ad7351ffc808c79bc6de35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 20:22:29 +0200 Subject: net: remove kernel_getsockopt No users left. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/linux/net.h | 2 -- net/socket.c | 34 ---------------------------------- 2 files changed, 36 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 6451425e828f..74ef5d7315f7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, - int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 80422fc3c836..81a98b6cbd08 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,40 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_getsockopt - get a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Assigns the option length to @optlen. - * Returns 0 or an error. - */ - -int kernel_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int __user *uoptlen; - int err; - - uoptval = (char __user __force *) optval; - uoptlen = (int __user __force *) optlen; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); - else - err = sock->ops->getsockopt(sock, level, optname, uoptval, - uoptlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_getsockopt); - /** * kernel_setsockopt - set a socket option (kernel space) * @sock: socket -- cgit v1.2.3 From a7528198add88a6f51b8ade17a5cf86804b8f7ee Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Wed, 27 May 2020 18:03:34 +0200 Subject: mac80211: support control port TX status reporting Add support for TX status reporting for the control port TX API; this will be used by hostapd when it moves to the control port TX API. 
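Concretely, when a control port frame was sent with a TX-status cookie, the ack path now routes the report to the matching cfg80211 notifier based on the frame type; a condensed sketch of that dispatch (simplified from the status.c hunk below, with the pre-existing ack-signal branch elided):

	/* inside ieee80211_report_ack_skb(), sketch only */
	if (ieee80211_is_mgmt(hdr->frame_control))
		cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
					skb->data, skb->len,
					acked, GFP_ATOMIC);
	else
		cfg80211_control_port_tx_status(&sdata->wdev, cookie,
						skb->data, skb->len,
						acked, GFP_ATOMIC);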
Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200527160334.19224-1-markus.theil@tu-ilmenau.de [fix commit message, it was referring to nl80211] Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/main.c | 2 ++ net/mac80211/status.c | 9 ++++++- net/mac80211/tdls.c | 2 +- net/mac80211/tx.c | 62 ++++++++++++++++++++++++++++++++-------------- 5 files changed, 56 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b87dc873825b..b7935f3d000d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1783,7 +1783,8 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags); + u32 ctrl_flags, + u64 *cookie); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); struct sk_buff * diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ac74bd780b42..b4a2efe8e83a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 22512805eafb..7b1bacac39c6 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -649,10 +649,17 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, info->status.ack_signal, info->status.is_valid_ack_signal, GFP_ATOMIC); - else + else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); + else + cfg80211_control_port_tx_status(&sdata->wdev, + cookie, + skb->data, + skb->len, + acked, + GFP_ATOMIC); } rcu_read_unlock(); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 8ad420db3766..4b0cff4a07bd 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1054,7 +1054,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, /* disable bottom halves when entering the Tx path */ local_bh_disable(); - __ieee80211_subif_start_xmit(skb, dev, flags, 0); + __ieee80211_subif_start_xmit(skb, dev, flags, 0, NULL); local_bh_enable(); return ret; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 5931128e1855..e9ce658141f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2436,13 +2436,19 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } -static int ieee80211_store_ack_skb(struct ieee80211_local *local, +static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, - u32 *info_flags) + u32 *info_flags, + u64 *cookie) { - struct sk_buff *ack_skb = skb_clone_sk(skb); + struct sk_buff *ack_skb; u16 info_id = 0; + if (skb->sk) + ack_skb = skb_clone_sk(skb); + else + ack_skb = skb_clone(skb, GFP_ATOMIC); + if (ack_skb) { unsigned long flags; int id; @@ -2455,6 +2461,10 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + if (cookie) { + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + } } else { kfree_skb(ack_skb); } @@ -2484,7 +2494,8 @@ static 
int ieee80211_store_ack_skb(struct ieee80211_local *local, */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, - struct sta_info *sta, u32 ctrl_flags) + struct sta_info *sta, u32 ctrl_flags, + u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; @@ -2755,9 +2766,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } - if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info_id = ieee80211_store_ack_skb(local, skb, &info_flags); + if (unlikely(!multicast && ((skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || + ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags, + cookie); /* * If the skb is shared we need to obtain our own copy. @@ -3913,7 +3926,8 @@ EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags) + u32 ctrl_flags, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -3983,7 +3997,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, - sta, ctrl_flags); + sta, ctrl_flags, cookie); if (IS_ERR(skb)) { kfree_skb_list(next); goto out; @@ -4125,9 +4139,9 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } else { - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } return NETDEV_TX_OK; @@ -4215,7 +4229,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (unlikely(!multicast && skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - ieee80211_store_ack_skb(local, skb, &info->flags); + ieee80211_store_ack_skb(local, skb, &info->flags, NULL); memset(info, 0, sizeof(*info)); @@ -4299,7 +4313,7 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, goto out; } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0); + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0, NULL); if (IS_ERR(skb)) goto out; @@ -5347,7 +5361,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; - u32 flags; + u32 flags = 0; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication @@ -5360,9 +5374,13 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; if (unencrypted) - flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; - else - flags = 0; + flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + if (cookie) + ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + + flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_INJECTED; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); @@ -5383,10 +5401,15 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + /* mutex lock is only needed for incrementing the cookie counter */ + mutex_lock(&local->mtx); + local_bh_disable(); - 
__ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); + mutex_unlock(&local->mtx); + return 0; } @@ -5413,7 +5436,8 @@ int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, - IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP); + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, + NULL); local_bh_enable(); return 0; -- cgit v1.2.3 From d29245692a44d71d5e2e0770463184a693696232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2020 17:34:58 -0700 Subject: tcp: ipv6: support RFC 6069 (TCP-LD) Make tcp_ld_RTO_revert() helper available to IPv6, and implement RFC 6069 : Quoting this RFC : 3. Connectivity Disruption Indication For Internet Protocol version 6 (IPv6) [RFC2460], the counterpart of the ICMP destination unreachable message of code 0 (net unreachable) and of code 1 (host unreachable) is the ICMPv6 destination unreachable message of code 0 (no route to destination) [RFC4443]. As with IPv4, a router should generate an ICMPv6 destination unreachable message of code 0 in response to a packet that cannot be delivered to its destination address because it lacks a matching entry in its routing table. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index b681338a8320..66e4b8331850 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -437,6 +437,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); +void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4eef5b84fff1..ad6435ba6d72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -404,7 +404,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) EXPORT_SYMBOL(tcp_req_err); /* TCP-LD (RFC 6069) logic */ -static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -441,6 +441,7 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } +EXPORT_SYMBOL(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 01a6f5111a77..b7415ca75c2d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -473,6 +473,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else sk->sk_err_soft = err; goto out; + case TCP_LISTEN: + break; + default: + /* check if this ICMP message allows revert of backoff. 
+ * (see RFC 6069) + */ + if (!fastopen && type == ICMPV6_DEST_UNREACH && + code == ICMPV6_NOROUTE) + tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && np->recverr) { -- cgit v1.2.3 From b58f0e8f38c0a44afa59601a115bd231f23471e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:09 +0200 Subject: net: add sock_set_reuseaddr Add a helper to directly set the SO_REUSEADDR sockopt from kernel space without going through a fake uaccess. For this the iscsi target now has to formally depend on inet to avoid a mostly theoretical compile failure. For actual operation it already did depend on having ipv4 or ipv6 support. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/infiniband/sw/siw/siw_cm.c | 18 +++++------------- drivers/nvme/target/tcp.c | 8 +------- drivers/target/iscsi/Kconfig | 2 +- drivers/target/iscsi/iscsi_target_login.c | 9 +-------- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 2 ++ net/core/sock.c | 8 ++++++++ 7 files changed, 19 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 559e5fd3bad8..d1860f3e8740 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1312,17 +1312,14 @@ static void siw_cm_llp_state_change(struct sock *sk) static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr) { - int rv, flags = 0, s_val = 1; + int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv < 0) - return rv; + sock_set_reuseaddr(s->sk); rv = s->ops->bind(s, laddr, size); if (rv < 0) @@ -1781,7 +1778,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct siw_cep *cep = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; - int rv = 0, s_val; + int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; @@ -1793,13 +1790,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) /* * Allow binding local port when still in TIME_WAIT from last close. 
*/ - s_val = 1; - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv) { - siw_dbg(id->device, "setsockopt error: %d\n", rv); - goto error; - } + sock_set_reuseaddr(s->sk); + if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f0da04e960f4..40757a63f455 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1632,6 +1632,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->sock->sk->sk_user_data = port; port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + sock_set_reuseaddr(port->sock->sk); opt = 1; ret = kernel_setsockopt(port->sock, IPPROTO_TCP, @@ -1641,13 +1642,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); - goto err_sock; - } - if (so_priority > 0) { ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, (char *)&so_priority, sizeof(so_priority)); diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig index 1f93ea381353..922484ea4e30 100644 --- a/drivers/target/iscsi/Kconfig +++ b/drivers/target/iscsi/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISCSI_TARGET tristate "Linux-iSCSI.org iSCSI Target Mode Stack" - depends on NET + depends on INET select CRYPTO select CRYPTO_CRC32C select CRYPTO_CRC32C_INTEL if X86 diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 731ee67fe914..91acb3f07b4c 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -909,14 +909,7 @@ int iscsit_setup_np( } } - /* FIXME: Someone please explain why this is endian-safe */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for SO_REUSEADDR" - " failed\n"); - goto fail; - } + sock_set_reuseaddr(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index f13dad0fd9ef..88f2574ca63a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1127,12 +1127,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one)); - result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&one, sizeof(one)); + sock_set_reuseaddr(sock->sk); - if (result < 0) { - log_print("Failed to set SO_REUSEADDR on socket: %d", result); - } write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); diff --git a/include/net/sock.h b/include/net/sock.h index 3e8c6d4b4b59..2ec085044790 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,4 +2688,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_set_reuseaddr(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index fd85e651ce28..18eb84fdf5fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -712,6 +712,14 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + 
release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From c433594c07457d2b2e41a87014bfad9bec279abf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:10 +0200 Subject: net: add sock_no_linger Add a helper to directly set the SO_LINGER sockopt from kernel space with onoff set to true and a linger time of 0 without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- drivers/nvme/target/tcp.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 9 +++++++++ net/rds/tcp.h | 1 - net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 13 +------------ net/sunrpc/svcsock.c | 12 ++---------- 8 files changed, 16 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c15a92163c1f..e72d87482eb7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; @@ -1361,13 +1360,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) { - dev_err(nctrl->device, - "failed to set SO_LINGER sock opt %d\n", ret); - goto err_sock; - } + sock_no_linger(queue->sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 40757a63f455..e0801494b097 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1429,7 +1429,6 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; struct inet_sock *inet = inet_sk(sock->sk); - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; ret = kernel_getsockname(sock, @@ -1447,10 +1446,7 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. 
*/ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) - return ret; + sock_no_linger(sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, diff --git a/include/net/sock.h b/include/net/sock.h index 2ec085044790..6ed00bf009bb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_no_linger(struct sock *sk); void sock_set_reuseaddr(struct sock *sk); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 18eb84fdf5fb..f0f09524911c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -720,6 +720,15 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361d21c7..d640e210b97b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -73,7 +73,6 @@ void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); -void rds_tcp_set_linger(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..4e64598176b0 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -207,7 +207,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) if (sock) { if (rds_destroy_pending(cp->cp_conn)) - rds_tcp_set_linger(sock); + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 810a3a49e947..bbb31b9c0b39 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -111,17 +111,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } -void rds_tcp_set_linger(struct socket *sock) -{ - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; - - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); -} - int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -241,7 +230,7 @@ rst_nsk: * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. 
*/ - rds_tcp_set_linger(new_sock); + sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..6773dacc64d8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* -- cgit v1.2.3 From 6e43496745e75ac49d644df984d2f4ee5b5b6b4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:11 +0200 Subject: net: add sock_set_priority Add a helper to directly set the SO_PRIORITY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 12 ++---------- drivers/nvme/target/tcp.c | 18 ++++-------------- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e72d87482eb7..a307972d33a0 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1362,16 +1362,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, */ sock_no_linger(queue->sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set SO_PRIORITY sock opt, ret %d\n", - ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ if (nctrl->opts->tos >= 0) { diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index e0801494b097..f3088156d01d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1448,12 +1448,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) */ sock_no_linger(sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) - return ret; - } + if (so_priority > 0) + sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ if (inet->rcv_tos > 0) { @@ -1638,14 +1634,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - if (so_priority > 0) { - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(port->sock->sk, so_priority); ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); diff --git a/include/net/sock.h b/include/net/sock.h index 6ed00bf009bb..a3a43141a4be 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); #endif 
/* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index f0f09524911c..ceda1a9248b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_no_linger(struct sock *sk) } EXPORT_SYMBOL(sock_no_linger); +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From 76ee0785f42afbc0418072b7179d95f450d3c9a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:12 +0200 Subject: net: add sock_set_sndtimeo Add a helper to directly set the SO_SNDTIMEO_NEW sockopt from kernel space without going through a fake uaccess. The interface is simplified to only pass the seconds value, as that is the only thing needed at the moment. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 8 ++------ include/net/sock.h | 1 + net/core/sock.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 88f2574ca63a..b79711d0aac7 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -918,7 +918,6 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -970,13 +969,10 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 5); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); - memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 0); if (result == -EINPROGRESS) result = 0; diff --git a/include/net/sock.h b/include/net/sock.h index a3a43141a4be..9a7b9e98685a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,5 +2691,6 @@ void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); +void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ceda1a9248b3..d3b1d61e4f76 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -737,6 +737,17 @@ void sock_set_priority(struct sock *sk, u32 priority) } EXPORT_SYMBOL(sock_set_priority); +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From 7594888c782e735f8a7b110094307a4dbe7b3f03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:13 +0200 Subject: net: add sock_bindtoindex Add a helper to directly set the SO_BINDTOIFINDEX sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/sock.h | 1 + net/core/sock.c | 21 +++++++++++++++------ net/ipv4/udp_tunnel.c | 4 +--- net/ipv6/ip6_udp_tunnel.c | 4 +--- 4 files changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 9a7b9e98685a..cdec7bc055d5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +int sock_bindtoindex(struct sock *sk, int ifindex); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index d3b1d61e4f76..23f80880fbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,18 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex) +{ + int ret; + + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +646,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index); out: #endif @@ -1216,7 +1225,7 @@ set_rcvbuf: break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..2158e8bddf41 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 58956a6b66a2..6523609516d2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -33,9 +33,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } -- cgit v1.2.3 From 783da70e83967efeacf3c02c9dcfdc2b17bd62eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:14 +0200 Subject: net: add sock_enable_timestamps Add a helper to directly enable timestamps instead of setting the SO_TIMESTAMP* sockopts from kernel space and going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- include/net/sock.h | 1 + net/core/sock.c | 47 +++++++++++++++++++++++++++++------------------ net/rxrpc/local_object.c | 8 +------- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index cdec7bc055d5..99ef43508d2b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); +void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 23f80880fbb2..e4a4dd2b3d8b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -757,6 +757,28 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs) } EXPORT_SYMBOL(sock_set_sndtimeo); +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -948,28 +970,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..5ea2bd01fdd5 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -189,13 +189,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* We want receive timestamps. */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + sock_enable_timestamps(local->socket->sk); break; default: -- cgit v1.2.3 From ce3d9544cecacd40389c399d2b7ca31acc533b70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:15 +0200 Subject: net: add sock_set_keepalive Add a helper to directly set the SO_KEEPALIVE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 10 ++++++++++ net/rds/tcp_listen.c | 6 +----- net/sunrpc/xprtsock.c | 4 +--- 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b79711d0aac7..b6e6dba28154 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1142,11 +1142,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, con->sock = NULL; goto create_out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&one, sizeof(one)); - if (result < 0) { - log_print("Set keepalive failed: %d", result); - } + sock_set_keepalive(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { diff --git a/include/net/sock.h b/include/net/sock.h index 99ef43508d2b..dc08c176238f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,6 +2691,7 @@ void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index e4a4dd2b3d8b..728f5fb156a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -779,6 +779,16 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. 
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index bbb31b9c0b39..d8bd13276959 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -43,13 +43,9 @@ int rds_tcp_keepalive(struct socket *sock) /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int keepalive = 1; int ret = 0; - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&keepalive, sizeof(keepalive)); - if (ret < 0) - goto bail; + sock_set_keepalive(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..30082cd03996 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2110,7 +2110,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2122,8 +2121,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); + sock_set_keepalive(sock->sk); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, -- cgit v1.2.3 From 26cfabf9cdd273650126d84a48a7f8dedbcded48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:16 +0200 Subject: net: add sock_set_rcvbuf Add a helper to directly set the SO_RCVBUFFORCE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/dlm/lowcomms.c | 7 +------ include/net/sock.h | 1 + net/core/sock.c | 59 +++++++++++++++++++++++++++++------------------------- 3 files changed, 34 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b6e6dba28154..2822a430a2b4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1180,7 +1180,6 @@ static int sctp_listen_for_all(void) struct socket *sock = NULL; int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); - int bufsize = NEEDED_RMEM; int one = 1; if (!con) @@ -1195,11 +1194,7 @@ static int sctp_listen_for_all(void) goto out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, - (char *)&bufsize, sizeof(bufsize)); - if (result) - log_print("Error increasing buffer space on socket %d", result); - + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) diff --git a/include/net/sock.h b/include/net/sock.h index dc08c176238f..c997289aabbf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2693,6 +2693,7 @@ void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); +void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index 728f5fb156a0..3c6ebf952e9a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -789,6 +789,35 @@ void sock_set_keepalive(struct sock *sk) } EXPORT_SYMBOL(sock_set_keepalive); +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -885,30 +914,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. 
- * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -920,9 +926,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) -- cgit v1.2.3 From fe31a326a4aadb4a3ba2b21deacc380d06802737 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:17 +0200 Subject: net: add sock_set_reuseport Add a helper to directly set the SO_REUSEPORT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ net/sunrpc/xprtsock.c | 17 +---------------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index c997289aabbf..d994daa418ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2695,6 +2695,7 @@ void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); +void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 3c6ebf952e9a..2ca3425b519c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + void sock_no_linger(struct sock *sk) { lock_sock(sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 30082cd03996..399848c2bcb2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1594,21 +1594,6 @@ static int xs_get_random_port(void) return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. - */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { -- cgit v1.2.3 From db10538a4b997a77a1fd561adaaa58afc7dcfa2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:18 +0200 Subject: tcp: add tcp_sock_set_cork Add a helper to directly set the TCP_CORK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- drivers/block/drbd/drbd_int.h | 14 ----------- drivers/block/drbd/drbd_receiver.c | 4 +-- drivers/block/drbd/drbd_worker.c | 6 ++--- fs/cifs/transport.c | 8 ++---- include/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 51 ++++++++++++++++++++++++-------------- net/rds/tcp_send.c | 9 ++----- 7 files changed, 43 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aae99a2d7bd4..3550adc93c68 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,20 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_cork(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_uncork(struct socket *sock) -{ - int val = 0; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c15e7083b13a..55ea907ad33c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6162,7 +6162,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -6175,7 +6175,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, false); return; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0dc019da1f8d..2b89c9f2ca70 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (uncork) { mutex_lock(&connection->data.mutex); if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); mutex_unlock(&connection->data.mutex); } @@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * mutex_lock(&connection->data.mutex); if (connection->data.socket) { if (cork) - drbd_tcp_cork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, true); else if (!uncork) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); } mutex_unlock(&connection->data.mutex); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c97570eb2c18..99760063e000 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -325,7 +325,6 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; - int val = 1; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { @@ -345,8 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } /* cork the socket */ - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, true); for (j = 0; j < num_rqst; j++) send_length += smb_rqst_len(server, &rqst[j]); @@ -435,9 +433,7 
@@ unmask: } /* uncork it */ - val = 0; - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, false); if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bf44e85d709d..889eeb2256c2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -497,4 +497,6 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void tcp_sock_set_cork(struct sock *sk, bool on); + #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 970064996377..e6cf702e16d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2801,6 +2801,37 @@ static void tcp_enable_tx_delay(void) } } +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +static void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + /* * Socket option code for TCP. */ @@ -2979,25 +3010,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. 
- */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 78a2554a4497..8c4d1d6e9249 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -38,23 +38,18 @@ #include "rds.h" #include "tcp.h" -static void rds_tcp_cork(struct socket *sock, int val) -{ - kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); -} - void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 1); + tcp_sock_set_cork(tc->t_sock->sk, true); } void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 0); + tcp_sock_set_cork(tc->t_sock->sk, false); } /* the core send_sem serializes this with other xmit and shutdown */ -- cgit v1.2.3 From 12abc5ee7873a085cc280240822b8ac53c86fecd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:19 +0200 Subject: tcp: add tcp_sock_set_nodelay Add a helper to directly set the TCP_NODELAY sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Acked-by: Jason Gunthorpe Signed-off-by: David S. Miller --- drivers/block/drbd/drbd_int.h | 7 ------ drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 4 ++-- drivers/infiniband/sw/siw/siw_cm.c | 24 ++++--------------- drivers/nvme/host/tcp.c | 9 +------ drivers/nvme/target/tcp.c | 12 ++-------- drivers/target/iscsi/iscsi_target_login.c | 15 +++--------- fs/cifs/connect.c | 10 ++------ fs/dlm/lowcomms.c | 8 ++----- fs/ocfs2/cluster/tcp.c | 20 ++-------------- include/linux/tcp.h | 1 + net/ceph/messenger.c | 11 ++------- net/ipv4/tcp.c | 39 ++++++++++++++++++++----------- net/rds/tcp.c | 11 +-------- net/rds/tcp.h | 1 - net/rds/tcp_listen.c | 2 +- 16 files changed, 49 insertions(+), 127 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 3550adc93c68..e24bba87c8e0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_nodelay(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); -} - static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c094c3c2c5d4..45fbd526c453 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr, /* DRBD protocol "pings" are latency critical. 
* This is supposed to trigger tcp_push_pending_frames() */ if (!err && (cmd == P_PING || cmd == P_PING_ACK)) - drbd_tcp_nodelay(sock->socket); + tcp_sock_set_nodelay(sock->socket->sk); return err; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 55ea907ad33c..20a5e94494ac 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1051,8 +1051,8 @@ randomize: /* we don't want delays. * we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index d1860f3e8740..1662216be66d 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -947,16 +947,8 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rv) { - siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); @@ -1386,16 +1378,8 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, - sizeof(val)); - if (rv) { - siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a307972d33a0..4e4a750ecdb9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1346,14 +1346,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Set TCP no delay */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_nodelay(queue->sock->sk); /* * Cleanup whatever is sitting in the TCP transmit queue on socket diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f3088156d01d..55bc4c3c0a74 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1580,7 +1580,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; __kernel_sa_family_t af; - int opt, ret; + int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -1625,15 +1625,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; sock_set_reuseaddr(port->sock->sk); - - opt = 1; - ret = kernel_setsockopt(port->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } - + tcp_sock_set_nodelay(port->sock->sk); if (so_priority > 0) sock_set_priority(port->sock->sk, so_priority); diff --git a/drivers/target/iscsi/iscsi_target_login.c 
b/drivers/target/iscsi/iscsi_target_login.c index 91acb3f07b4c..b561b07a869a 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -897,20 +897,11 @@ int iscsit_setup_np( /* * Set SO_REUSEADDR, and disable Nagel Algorithm with TCP_NODELAY. */ - /* FIXME: Someone please explain why this is endian-safe */ - opt = 1; - if (np->np_network_transport == ISCSI_TCP) { - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for TCP_NODELAY" - " failed: %d\n", ret); - goto fail; - } - } - + if (np->np_network_transport == ISCSI_TCP) + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); + opt = 1; ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); if (ret < 0) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 28268ed461b8..ad8fb53b3682 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3929,14 +3929,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_rcvbuf = 140 * 1024; } - if (server->tcp_nodelay) { - int val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", - rc); - } + if (server->tcp_nodelay) + tcp_sock_set_nodelay(socket->sk); cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2822a430a2b4..69333728d871 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1011,7 +1011,6 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; - int one = 1; int result; if (con->nodeid == 0) { @@ -1060,8 +1059,7 @@ static void tcp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1103,7 +1101,6 @@ static struct socket *tcp_create_listen_sock(struct connection *con, { struct socket *sock = NULL; int result = 0; - int one = 1; int addr_len; if (dlm_local_addr[0]->ss_family == AF_INET) @@ -1120,8 +1117,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, } /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2c512b40a940..4c70fe9d19ab 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int val = 1; - - return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&val, sizeof(val)); -} - static int o2net_set_usertimeout(struct socket *sock) { int user_timeout = O2NET_TCP_USER_TIMEOUT; @@ -1636,11 +1628,7 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); ret = o2net_set_usertimeout(sock); if (ret) { @@ -1832,11 +1820,7 @@ static int o2net_accept_one(struct socket *sock, int *more) *more = 1; 
new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); ret = o2net_set_usertimeout(new_sock); if (ret) { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 889eeb2256c2..9e42c7fe50a8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,5 +498,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +void tcp_sock_set_nodelay(struct sock *sk); #endif /* _LINUX_TCP_H */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8ca5edc5f2c..27d6ab11f9ee 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { - int optval = 1; - - ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&optval, sizeof(optval)); - if (ret) - pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", - ret); - } + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) + tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6cf702e16d6..a65f293a19fa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2832,6 +2832,30 @@ void tcp_sock_set_cork(struct sock *sk, bool on) } EXPORT_SYMBOL(tcp_sock_set_cork); +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. + */ +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + /* * Socket option code for TCP. */ @@ -2929,20 +2953,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. 
- */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 46782fac4c16..43db0eca911f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -89,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { { } }; -/* doing it this way avoids calling tcp_sk() */ -void rds_tcp_nonagle(struct socket *sock) -{ - int val = 1; - - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, - sizeof(val)); -} - u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { /* seq# of the last byte of data in tcp send buffer */ @@ -502,7 +493,7 @@ void rds_tcp_tune(struct socket *sock) struct net *net = sock_net(sk); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); lock_sock(sk); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index d640e210b97b..f6d75d8cb167 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,7 +50,6 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); -void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index d8bd13276959..6f90ea077adc 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -288,7 +288,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) } sock->sk->sk_reuse = SK_CAN_REUSE; - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; -- cgit v1.2.3 From ddd061b8daed3ce0c01109a69c9a2a9f9669f01a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:20 +0200 Subject: tcp: add tcp_sock_set_quickack Add a helper to directly set the TCP_QUICKACK sockopt from kernel space without going through a fake uaccess. Cleanup the callers to avoid pointless wrappers now that this is a simple function call. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- drivers/block/drbd/drbd_int.h | 7 ------- drivers/block/drbd/drbd_receiver.c | 5 ++--- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 39 +++++++++++++++++++++++++------------- 4 files changed, 29 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e24bba87c8e0..14345a87c7cc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_quickack(struct socket *sock) -{ - int val = 2; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); -} - /* sets the number of 512 byte sectors of our virtual device */ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 20a5e94494ac..3a3f2b6a821f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str * quickly as possible, and let remote TCP know what we have * received so far. */ if (err == -EAGAIN) { - drbd_tcp_quickack(connection->data.socket); + tcp_sock_set_quickack(connection->data.socket->sk, 2); drbd_unplug_all_devices(connection); } if (err > 0) { @@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9e42c7fe50a8..2eaf8320b9db 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,5 +499,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); +void tcp_sock_set_quickack(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a65f293a19fa..27b5e7a4e2ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2856,6 +2856,31 @@ void tcp_sock_set_nodelay(struct sock *sk) } EXPORT_SYMBOL(tcp_sock_set_nodelay); +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + /* * Socket option code for TCP. 
*/ @@ -3096,19 +3121,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_QUICKACK: - if (!val) { - inet_csk_enter_pingpong_mode(sk); - } else { - inet_csk_exit_pingpong_mode(sk); - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - inet_csk_enter_pingpong_mode(sk); - } - } + __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 557eadfcc5ee8f8fa98a795e05ed21db58a65db5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:21 +0200 Subject: tcp: add tcp_sock_set_syncnt Add a helper to directly set the TCP_SYNCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4e4a750ecdb9..2872584f52f6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1336,14 +1336,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Single syn retry */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_SYNCNT sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_syncnt(queue->sock->sk, 1); /* Set TCP no delay */ tcp_sock_set_nodelay(queue->sock->sk); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2eaf8320b9db..6aa4ae5ebf3d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -500,5 +500,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); +int tcp_sock_set_syncnt(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27b5e7a4e2ef..d2c67ae1da07 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2881,6 +2881,18 @@ void tcp_sock_set_quickack(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_quickack); +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + lock_sock(sk); + inet_csk(sk)->icsk_syn_retries = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + /* * Socket option code for TCP. */ -- cgit v1.2.3 From c488aeadcbd002a992593e6090d54e8ac27c4310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:22 +0200 Subject: tcp: add tcp_sock_set_user_timeout Add a helper to directly set the TCP_USER_TIMEOUT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- fs/ocfs2/cluster/tcp.c | 22 ++-------------------- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 8 ++++++++ net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 12 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4c70fe9d19ab..79a231719460 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_usertimeout(struct socket *sock) -{ - int user_timeout = O2NET_TCP_USER_TIMEOUT; - - return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (void *)&user_timeout, sizeof(user_timeout)); -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1629,12 +1621,7 @@ static void o2net_start_connect(struct work_struct *work) } tcp_sock_set_nodelay(sc->sc_sock->sk); - - ret = o2net_set_usertimeout(sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1821,12 +1808,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->sk->sk_allocation = GFP_ATOMIC; tcp_sock_set_nodelay(new_sock->sk); - - ret = o2net_set_usertimeout(new_sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6aa4ae5ebf3d..de682143efe4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -501,5 +501,6 @@ void tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d2c67ae1da07..0004bd9ae7b0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2893,6 +2893,14 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_syncnt); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +{ + lock_sock(sk); + inet_csk(sk)->icsk_user_timeout = val; + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + /* * Socket option code for TCP. */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 399848c2bcb2..231fd6162f68 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2115,8 +2115,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, (char *)&keepcnt, sizeof(keepcnt)); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, -- cgit v1.2.3 From 71c48eb81c9ecb6fed49dc33e7c9b621fdcb7bf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:23 +0200 Subject: tcp: add tcp_sock_set_keepidle Add a helper to directly set the TCP_KEEPIDLE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 49 ++++++++++++++++++++++++++++++++++--------------- net/rds/tcp_listen.c | 5 +---- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 37 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index de682143efe4..5724dd84a85e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepidle(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0004bd9ae7b0..bdf0ff933351 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2901,6 +2901,39 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); +static int __tcp_sock_set_keepidle(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = __tcp_sock_set_keepidle(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + /* * Socket option code for TCP. */ @@ -3070,21 +3103,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = __tcp_sock_set_keepidle(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 6f90ea077adc..79f9adc00811 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -52,10 +52,7 @@ int rds_tcp_keepalive(struct socket *sock) if (ret < 0) goto bail; - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - if (ret < 0) - goto bail; + tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. 
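Unlike the void helpers, tcp_sock_set_keepidle() keeps the range check from do_tcp_setsockopt() and can fail. A minimal sketch of a checking caller, assuming an already-created TCP socket and an illustrative 60-second idle time:

#include <linux/net.h>
#include <linux/printk.h>
#include <linux/tcp.h>

/* Hypothetical caller: send the first keepalive probe after 60s of idle. */
static int example_set_keepidle(struct socket *sock)
{
	int err = tcp_sock_set_keepidle(sock->sk, 60);

	/* -EINVAL when the value is outside 1..MAX_TCP_KEEPIDLE. */
	if (err)
		pr_warn("keepidle rejected: %d\n", err);
	return err;
}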
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 231fd6162f68..473290f7c5c0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2107,8 +2107,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepidle(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, -- cgit v1.2.3 From d41ecaac903c9f4658a71d4e7a708673cfb5abba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:24 +0200 Subject: tcp: add tcp_sock_set_keepintvl Add a helper to directly set the TCP_KEEPINTVL sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp_listen.c | 4 +--- net/sunrpc/xprtsock.c | 3 +-- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5724dd84a85e..1f9bada00faa 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -499,6 +499,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepidle(struct sock *sk, int val); +int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bdf0ff933351..7eb083e09786 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2934,6 +2934,18 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepidle); +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_intvl = val * HZ; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + /* * Socket option code for TCP. */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 79f9adc00811..9ad555c48d15 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -53,12 +53,10 @@ int rds_tcp_keepalive(struct socket *sock) goto bail; tcp_sock_set_keepidle(sock->sk, keepidle); - /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. 
*/ - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); bail: return ret; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 473290f7c5c0..5ca64e12af0c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2108,8 +2108,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, /* TCP Keepalive options */ sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); + tcp_sock_set_keepintvl(sock->sk, keepidle); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); -- cgit v1.2.3 From 480aeb9639d6a077c611b303a22f9b1e5937d081 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:25 +0200 Subject: tcp: add tcp_sock_set_keepcnt Add a helper to directly set the TCP_KEEPCNT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/rds/tcp.h | 2 +- net/rds/tcp_listen.c | 17 +++-------------- net/sunrpc/xprtsock.c | 3 +-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1f9bada00faa..9aac824c523c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -498,6 +498,7 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7eb083e09786..15d47d5e7951 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2946,6 +2946,18 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_keepintvl); +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_probes = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + /* * Socket option code for TCP. 
*/ diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f6d75d8cb167..bad9cf49d565 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -70,7 +70,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); -int rds_tcp_keepalive(struct socket *sock); +void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); /* tcp_recv.c */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 9ad555c48d15..101cf14215a0 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,27 +38,19 @@ #include "rds.h" #include "tcp.h" -int rds_tcp_keepalive(struct socket *sock) +void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int ret = 0; sock_set_keepalive(sock->sk); - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - if (ret < 0) - goto bail; - + tcp_sock_set_keepcnt(sock->sk, keepcnt); tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ tcp_sock_set_keepintvl(sock->sk, keepidle); -bail: - return ret; } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the @@ -140,10 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); - ret = rds_tcp_keepalive(new_sock); - if (ret < 0) - goto out; - + rds_tcp_keepalive(new_sock); rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5ca64e12af0c..0d3ec055bc12 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2109,8 +2109,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); tcp_sock_set_keepintvl(sock->sk, keepidle); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); -- cgit v1.2.3 From 6ebf71bab9fb476fc8132be4c12b88201278f0ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:26 +0200 Subject: ipv4: add ip_sock_set_tos Add a helper to directly set the IP_TOS sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. 
Miller --- drivers/nvme/host/tcp.c | 14 +++----------- drivers/nvme/target/tcp.c | 10 ++-------- include/net/ip.h | 2 ++ net/ipv4/ip_sockglue.c | 30 +++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2872584f52f6..4c972d8abf31 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - int ret, opt, rcv_pdu_size; + int ret, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1352,16 +1352,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ - if (nctrl->opts->tos >= 0) { - opt = nctrl->opts->tos; - ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set IP_TOS sock opt %d\n", ret); - goto err_sock; - } - } + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); queue->sock->sk->sk_allocation = GFP_ATOMIC; nvme_tcp_set_queue_io_cpu(queue); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 55bc4c3c0a74..4546049a96b3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1452,14 +1452,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ - if (inet->rcv_tos > 0) { - int tos = inet->rcv_tos; - - ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, - (char *)&tos, sizeof(tos)); - if (ret) - return ret; - } + if (inet->rcv_tos > 0) + ip_sock_set_tos(sock->sk, inet->rcv_tos); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = queue; diff --git a/include/net/ip.h b/include/net/ip.h index 5b317c9f4470..2fc52e26fa88 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,4 +765,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_tos(struct sock *sk, int val); + #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f43d5f12aa86..b43a29e11f4a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -560,6 +560,26 @@ out: return err; } +static void __ip_sock_set_tos(struct sock *sk, int val) +{ + if (sk->sk_type == SOCK_STREAM) { + val &= ~INET_ECN_MASK; + val |= inet_sk(sk)->tos & INET_ECN_MASK; + } + if (inet_sk(sk)->tos != val) { + inet_sk(sk)->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } +} + +void ip_sock_set_tos(struct sock *sk, int val) +{ + lock_sock(sk); + __ip_sock_set_tos(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_tos); /* * Socket option code for IP. 
This is the end of the line after any @@ -823,15 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; break; case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~INET_ECN_MASK; - val |= inet->tos & INET_ECN_MASK; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } + __ip_sock_set_tos(sk, val); break; case IP_TTL: if (optlen < 1) -- cgit v1.2.3 From c4e446bf5a06a1db24b4f0115a89f0380a495c62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:27 +0200 Subject: ipv4: add ip_sock_set_freebind Add a helper to directly set the IP_FREEBIND sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/target/iscsi/iscsi_target_login.c | 13 +++---------- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index b561b07a869a..85748e338858 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -15,6 +15,7 @@ #include #include #include /* TCP_NODELAY */ +#include #include /* ipv6_addr_v4mapped() */ #include #include @@ -855,7 +856,7 @@ int iscsit_setup_np( struct sockaddr_storage *sockaddr) { struct socket *sock = NULL; - int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len; + int backlog = ISCSIT_TCP_BACKLOG, ret, len; switch (np->np_network_transport) { case ISCSI_TCP: @@ -900,15 +901,7 @@ int iscsit_setup_np( if (np->np_network_transport == ISCSI_TCP) tcp_sock_set_nodelay(sock->sk); sock_set_reuseaddr(sock->sk); - - opt = 1; - ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for IP_FREEBIND" - " failed\n"); - goto fail; - } + ip_sock_set_freebind(sock->sk); ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len); if (ret < 0) { diff --git a/include/net/ip.h b/include/net/ip.h index 2fc52e26fa88..5f5d8226b6ab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,6 +765,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_freebind(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b43a29e11f4a..767838d2030d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -581,6 +581,14 @@ void ip_sock_set_tos(struct sock *sk, int val) } EXPORT_SYMBOL(ip_sock_set_tos); +void ip_sock_set_freebind(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->freebind = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_freebind); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. -- cgit v1.2.3 From db45c0ef258ef6c7ef3c1b8ea9e06e133e083c27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:28 +0200 Subject: ipv4: add ip_sock_set_recverr Add a helper to directly set the IP_RECVERR sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells Signed-off-by: David S. 
Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ net/rxrpc/local_object.c | 8 +------- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 5f5d8226b6ab..f063a491b906 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -766,6 +766,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) } void ip_sock_set_freebind(struct sock *sk); +void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 767838d2030d..aca6b81da9ba 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -589,6 +589,14 @@ void ip_sock_set_freebind(struct sock *sk) } EXPORT_SYMBOL(ip_sock_set_freebind); +void ip_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->recverr = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_recverr); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 5ea2bd01fdd5..4c0e8fe5ec1f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -171,13 +171,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* Fall through */ case AF_INET: /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ opt = IP_PMTUDISC_DO; -- cgit v1.2.3 From 2de569bda2a66d1308ad3f205bb29cf4f95f5636 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:29 +0200 Subject: ipv4: add ip_sock_set_mtu_discover Add a helper to directly set the IP_MTU_DISCOVER sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells [rxrpc bits] Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 11 +++++++++++ net/rxrpc/local_object.c | 8 +------- net/rxrpc/output.c | 14 +++++--------- 4 files changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index f063a491b906..d3649c49dd33 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -766,6 +766,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) } void ip_sock_set_freebind(struct sock *sk); +int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aca6b81da9ba..aa115be11dcf 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -597,6 +597,17 @@ void ip_sock_set_recverr(struct sock *sk) } EXPORT_SYMBOL(ip_sock_set_recverr); +int ip_sock_set_mtu_discover(struct sock *sk, int val) +{ + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) + return -EINVAL; + lock_sock(sk); + inet_sk(sk)->pmtudisc = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(ip_sock_set_mtu_discover); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. 
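The rxrpc conversion below toggles the option around a fragmentable send. Condensed into a sketch, with the socket, message and iovec assumed to be prepared by the caller:

#include <linux/in.h>
#include <linux/net.h>
#include <net/ip.h>

/* Hypothetical send path: clear the DF bit for one datagram, then restore it.
 * Both constants are within the IP_PMTUDISC_DONT..IP_PMTUDISC_OMIT range the
 * helper accepts, so the return values are ignored here.
 */
static int example_send_fragmentable(struct socket *sock, struct msghdr *msg,
				     struct kvec *iov, size_t nr, size_t len)
{
	int ret;

	ip_sock_set_mtu_discover(sock->sk, IP_PMTUDISC_DONT);
	ret = kernel_sendmsg(sock, msg, iov, nr, len);
	ip_sock_set_mtu_discover(sock->sk, IP_PMTUDISC_DO);
	return ret;
}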
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 4c0e8fe5ec1f..6f4e6b4817cf 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -174,13 +174,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); /* We want receive timestamps. */ sock_enable_timestamps(local->socket->sk); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8b632a5c619..1ba43c3df4ad 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -321,7 +321,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - int ret, opt; + int ret; _enter(",{%d}", skb->len); @@ -473,18 +473,14 @@ send_fragmentable: switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: - opt = IP_PMTUDISC_DONT; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DONT); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); - opt = IP_PMTUDISC_DO; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DO); break; default: -- cgit v1.2.3 From c1f9ec5776dd05eaf62cf6788ecdfc905dc8ec2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:30 +0200 Subject: ipv4: add ip_sock_set_pktinfo Add a helper to directly set the IP_PKTINFO sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_sockglue.c | 8 ++++++++ net/sunrpc/svcsock.c | 5 ++--- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index d3649c49dd33..04ebe7bf54c6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -767,6 +767,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) void ip_sock_set_freebind(struct sock *sk); int ip_sock_set_mtu_discover(struct sock *sk, int val); +void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa115be11dcf..84ec3703c909 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -608,6 +608,14 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val) } EXPORT_SYMBOL(ip_sock_set_mtu_discover); +void ip_sock_set_pktinfo(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_pktinfo); + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. 
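(Aside on what IP_CMSG_PKTINFO buys the caller, before the sunrpc hunk below: once ip_sock_set_pktinfo() has been called, a recvmsg on the socket carries an IP_PKTINFO control message holding the datagram's destination address. The following is an illustrative sketch only; the function name is invented, and msg_control buffer setup and length checking are elided.)

#include <linux/in.h>
#include <linux/printk.h>
#include <linux/socket.h>

/* Sketch: read the destination address out of a received msghdr whose
 * msg_control buffer the IP layer filled in.
 */
static void example_read_pktinfo(struct msghdr *msg)
{
	struct cmsghdr *cmh = msg->msg_control;

	if (cmh->cmsg_level == SOL_IP && cmh->cmsg_type == IP_PKTINFO) {
		struct in_pktinfo *pki = CMSG_DATA(cmh);

		pr_debug("datagram sent to %pI4\n", &pki->ipi_addr);
	}
}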
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6773dacc64d8..7a805d165689 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -616,9 +616,8 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; - break; + ip_sock_set_pktinfo(svsk->sk_sock->sk); + return; case AF_INET6: level = SOL_IPV6; optname = IPV6_RECVPKTINFO; -- cgit v1.2.3 From 9b115749acb24d11083ded4fe947ddd654a940e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:31 +0200 Subject: ipv6: add ip6_sock_set_v6only Add a helper to directly set the IPV6_V6ONLY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 11 +++++++++++ net/ipv6/ip6_udp_tunnel.c | 5 +---- net/sunrpc/svcsock.c | 6 +----- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 39a00d3ef5e2..9b91188c9a74 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1177,4 +1177,15 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); + +static inline int ip6_sock_set_v6only(struct sock *sk) +{ + if (inet_sk(sk)->inet_num) + return -EINVAL; + lock_sock(sk); + sk->sk_ipv6only = true; + release_sock(sk); + return 0; +} + #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 6523609516d2..2e0ad1bc84a8 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -25,10 +25,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->ipv6_v6only) { - int val = 1; - - err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - (char *) &val, sizeof(val)); + err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7a805d165689..a391892977cd 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1328,7 +1328,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1364,11 +1363,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ error = kernel_bind(sock, sin, len); -- cgit v1.2.3 From fce934949c0f0003c1777fbf8c0706ba82a8cf7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:32 +0200 Subject: ipv6: add ip6_sock_set_recverr Add a helper to directly set the IPV6_RECVERR sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Reviewed-by: David Howells Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 7 +++++++ net/rxrpc/local_object.c | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9b91188c9a74..49c4abf99148 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1188,4 +1188,11 @@ static inline int ip6_sock_set_v6only(struct sock *sk) return 0; } +static inline void ip6_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->recverr = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 6f4e6b4817cf..c8b2097f499c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -107,7 +107,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct sock *usk; - int ret, opt; + int ret; _enter("%p{%d,%d}", local, local->srx.transport_type, local->srx.transport.family); @@ -157,13 +157,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) switch (local->srx.transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip6_sock_set_recverr(local->socket->sk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. -- cgit v1.2.3 From 18d5ad62327576cbb1e5b9938a59d63ac0c15832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:33 +0200 Subject: ipv6: add ip6_sock_set_addr_preferences Add a helper to directly set the IPV6_ADDR_PREFERENCES sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S.
Miller --- include/net/ipv6.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 59 +----------------------------------------- net/sunrpc/xprtsock.c | 7 ++--- 3 files changed, 72 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 49c4abf99148..9a9075983016 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1195,4 +1195,71 @@ static inline void ip6_sock_set_recverr(struct sock *sk) release_sock(sk); } +static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) +{ + unsigned int pref = 0; + unsigned int prefmask = ~0; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP | + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + prefmask &= ~IPV6_PREFER_SRC_COA; + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + return -EINVAL; + } + + inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; + return 0; +} + +static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) +{ + int ret; + + lock_sock(sk); + ret = __ip6_sock_set_addr_preferences(sk, val); + release_sock(sk); + return ret; +} + #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e10258c2210e..adbfed6adf11 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -845,67 +845,10 @@ done: break; case IPV6_ADDR_PREFERENCES: - { - unsigned int pref = 0; - unsigned int prefmask = ~0; - if (optlen < sizeof(int)) goto e_inval; - - retv = -EINVAL; - - /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ - switch (val & (IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP| - IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { - case IPV6_PREFER_SRC_PUBLIC: - pref |= IPV6_PREFER_SRC_PUBLIC; - break; - case IPV6_PREFER_SRC_TMP: - pref |= IPV6_PREFER_SRC_TMP; - break; - case IPV6_PREFER_SRC_PUBTMP_DEFAULT: - break; - case 0: - goto pref_skip_pubtmp; - default: - goto e_inval; - } - - prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP); -pref_skip_pubtmp: - - /* check HOME/COA conflicts */ - switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { - case IPV6_PREFER_SRC_HOME: - break; - case IPV6_PREFER_SRC_COA: - pref |= IPV6_PREFER_SRC_COA; - case 0: - goto pref_skip_coa; - default: - goto e_inval; - } - - prefmask &= ~IPV6_PREFER_SRC_COA; -pref_skip_coa: - - /* check CGA/NONCGA conflicts */ - switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { - case IPV6_PREFER_SRC_CGA: - case IPV6_PREFER_SRC_NONCGA: - case 0: - break; - default: - goto e_inval; - } - - np->srcprefs = (np->srcprefs & prefmask) | pref; - retv = 0; - + retv = __ip6_sock_set_addr_preferences(sk, val); break; - } case
IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) goto e_inval; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0d3ec055bc12..3a143e250b9a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2150,7 +2150,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. @@ -2159,8 +2158,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * knowledge about the normal duration of connections, * MAY override this as appropriate. */ - kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, - (char *)&addr_pref, sizeof(addr_pref)); + if (xs_addr(xprt)->sa_family == PF_INET6) { + ip6_sock_set_addr_preferences(sk, + IPV6_PREFER_SRC_PUBLIC); + } xs_tcp_set_socket_timeouts(xprt, sock); -- cgit v1.2.3 From 7d7207c2d57080af93fc323dc6a85bd79207b4c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:34 +0200 Subject: ipv6: add ip6_sock_set_recvpktinfo Add a helper to directly set the IPV6_RECVPKTINFO sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/ipv6.h | 7 +++++++ net/sunrpc/svcsock.c | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9a9075983016..5e65bf2fd32d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1262,4 +1262,11 @@ static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) return ret; } +static inline void ip6_sock_set_recvpktinfo(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->rxopt.bits.rxinfo = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a391892977cd..e7a0037d9b56 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -595,8 +595,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -617,17 +615,13 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) switch (svsk->sk_sk->sk_family) { case AF_INET: ip_sock_set_pktinfo(svsk->sk_sock->sk); - return; + break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* -- cgit v1.2.3 From 298cd88a66a02c899772ffafbf648786ceb5ab95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:35 +0200 Subject: rxrpc: add rxrpc_sock_set_min_security_level Add a helper to directly set the RXRPC_MIN_SECURITY_LEVEL sockopt from kernel space without going through a fake uaccess. Thanks to David Howells for the documentation updates. Signed-off-by: Christoph Hellwig Acked-by: David Howells Signed-off-by: David S. 
Miller --- Documentation/networking/rxrpc.rst | 13 +++++++++++-- fs/afs/rxrpc.c | 6 ++---- include/net/af_rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 13 +++++++++++++ 4 files changed, 28 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 5ad35113d0f4..68552b92dc44 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -477,7 +477,7 @@ AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: Encrypted checksum plus packet padded and first eight bytes of packet encrypted - which includes the actual packet length. - (c) RXRPC_SECURITY_ENCRYPTED + (c) RXRPC_SECURITY_ENCRYPT Encrypted checksum plus entire packet padded and encrypted, including actual packet length. @@ -578,7 +578,7 @@ A client would issue an operation by: This issues a request_key() to get the key representing the security context. The minimum security level can be set:: - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + unsigned int sec = RXRPC_SECURITY_ENCRYPT; setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, &sec, sizeof(sec)); @@ -1090,6 +1090,15 @@ The kernel interface functions are as follows: jiffies). In the event of the timeout occurring, the call will be aborted and -ETIME or -ETIMEDOUT will be returned. + (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within the + kernel:: + + int rxrpc_sock_set_min_security_level(struct sock *sk, + unsigned int val); + + This specifies the minimum security level required for calls on this + socket. + Configurable Parameters ======================= diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 1ecc67da6c1a..e313dae01674 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -37,7 +37,6 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; - unsigned int min_level; int ret; _enter(""); @@ -57,9 +56,8 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); - min_level = RXRPC_SECURITY_ENCRYPT; - ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - (void *)&min_level, sizeof(min_level)); + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); if (ret < 0) goto error_2; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ab988940bf04..91eacbdcf33d 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -72,4 +72,6 @@ bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); + #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 15ee92d79581..394189b81849 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -571,6 +571,19 @@ out: return ret; } +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) +{ + if (sk->sk_state != RXRPC_UNBOUND) + return -EISCONN; + if (val > RXRPC_SECURITY_MAX) + return -EINVAL; + lock_sock(sk); + rxrpc_sk(sk)->min_sec_level = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); + /* * set RxRPC socket options */ -- cgit v1.2.3 From 095ae612530c9465df6d372d688cb30c6abfc5f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:36 +0200 Subject: tipc: call tsk_set_importance from tipc_topsrv_create_listener Avoid using kernel_setsockopt
for the TIPC_IMPORTANCE option when we can just use the internal helper. The only change needed is to pass a struct sock instead of tipc_sock, which is private to socket.c. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/tipc/socket.c | 18 +++++++++--------- net/tipc/socket.h | 2 ++ net/tipc/topsrv.c | 6 +++--- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d6b67d07d22e..3734cdbedc9c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -196,17 +196,17 @@ static int tsk_importance(struct tipc_sock *tsk) return msg_importance(&tsk->phdr); } -static int tsk_set_importance(struct tipc_sock *tsk, int imp) +static struct tipc_sock *tipc_sk(const struct sock *sk) { - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&tsk->phdr, (u32)imp); - return 0; + return container_of(sk, struct tipc_sock, sk); } -static struct tipc_sock *tipc_sk(const struct sock *sk) +int tsk_set_importance(struct sock *sk, int imp) { - return container_of(sk, struct tipc_sock, sk); + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp); + return 0; } static bool tsk_conn_cong(struct tipc_sock *tsk) @@ -2721,7 +2721,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); - tsk_set_importance(new_tsock, msg_importance(msg)); + tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { new_tsock->conn_type = msg_nametype(msg); new_tsock->conn_instance = msg_nameinst(msg); @@ -3139,7 +3139,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tsk_set_importance(tsk, value); + res = tsk_set_importance(sk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 235b9679acee..b11575afc66f 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -75,4 +75,6 @@ u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); +int tsk_set_importance(struct sock *sk, int imp); + #endif diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 446af7bbd13e..1489cfb941d8 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -497,7 +497,6 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) { - int imp = TIPC_CRITICAL_IMPORTANCE; struct socket *lsock = NULL; struct sockaddr_tipc saddr; struct sock *sk; @@ -514,8 +513,9 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) sk->sk_user_data = srv; write_unlock_bh(&sk->sk_callback_lock); - rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); + lock_sock(sk); + rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE); + release_sock(sk); if (rc < 0) goto err; -- cgit v1.2.3 From e6da0edc24eecef2f6964d92fa9044e1821deace Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 28 May 2020 14:35:12 +0200 Subject: Bluetooth: Acquire sk_lock.slock without disabling interrupts There was a lockdep warning which led to commit fad003b6c8e3d ("Bluetooth: Fix inconsistent lock state with RFCOMM"). Lockdep noticed that `sk->sk_lock.slock' was acquired without disabling the softirq while the lock was also used in
softirq context. Unfortunately the solution back then was to disable interrupts before acquiring the lock, which was enough to make lockdep happy. It would have been enough to simply disable the softirq. Disabling interrupts before acquiring a spinlock_t is not allowed on PREEMPT_RT because these locks are converted to 'sleeping' spinlocks. Use spin_lock_bh() in order to acquire the `sk_lock.slock'. Reported-by: Luis Claudio R. Goncalves Reported-by: kbuild test robot [missing unlock] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Marcel Holtmann --- net/bluetooth/rfcomm/sock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b4eaf21360ef..df14eebe80da 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -64,15 +64,13 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) { struct sock *sk = d->owner, *parent; - unsigned long flags; if (!sk) return; BT_DBG("dlc %p state %ld err %d", d, d->state, err); - local_irq_save(flags); - bh_lock_sock(sk); + spin_lock_bh(&sk->sk_lock.slock); if (err) sk->sk_err = err; @@ -93,8 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sk->sk_state_change(sk); } - bh_unlock_sock(sk); - local_irq_restore(flags); + spin_unlock_bh(&sk->sk_lock.slock); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise -- cgit v1.2.3 From 05bfd3661448a46db3a258b316160d34cf0a1317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:41 +0200 Subject: sctp: refactor sctp_setsockopt_bindx Split out a sctp_setsockopt_bindx_kernel that takes a kernel pointer to the sockaddr and make sctp_setsockopt_bindx a small wrapper around it. This prepares for adding a new bind_add proto op. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 61 +++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 827a9903ee28..6e745ac3c4a5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -972,23 +972,22 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * it. * * sk The sk of the socket - * addrs The pointer to the addresses in user land + * addrs The pointer to the addresses * addrssize Size of the addrs buffer * op Operation to perform (add or remove, see the flags of * sctp_bindx) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, int op) +static int sctp_setsockopt_bindx_kernel(struct sock *sk, + struct sockaddr *addrs, int addrs_size, + int op) { - struct sockaddr *kaddrs; int err; int addrcnt = 0; int walk_size = 0; struct sockaddr *sa_addr; - void *addr_buf; + void *addr_buf = addrs; struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", @@ -997,17 +996,10 @@ static int sctp_setsockopt_bindx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - /* Walk through the addrs buffer and count the number of addresses.
*/ addr_buf = kaddrs; while (walk_size < addrs_size) { - if (walk_size + sizeof(sa_family_t) > addrs_size) { - kfree(kaddrs); + if (walk_size + sizeof(sa_family_t) > addrs_size) return -EINVAL; - } sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); @@ -1015,10 +1007,8 @@ static int sctp_setsockopt_bindx(struct sock *sk, /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. */ - if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - kfree(kaddrs); + if (!af || (walk_size + af->sockaddr_len) > addrs_size) return -EINVAL; - } addrcnt++; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; @@ -1029,31 +1019,36 @@ static int sctp_setsockopt_bindx(struct sock *sk, case SCTP_BINDX_ADD_ADDR: /* Allow security module to validate bindx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD, - (struct sockaddr *)kaddrs, - addrs_size); + addrs, addrs_size); if (err) - goto out; - err = sctp_bindx_add(sk, kaddrs, addrcnt); + return err; + err = sctp_bindx_add(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt); - break; - + return err; + return sctp_send_asconf_add_ip(sk, addrs, addrcnt); case SCTP_BINDX_REM_ADDR: - err = sctp_bindx_rem(sk, kaddrs, addrcnt); + err = sctp_bindx_rem(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt); - break; + return err; + return sctp_send_asconf_del_ip(sk, addrs, addrcnt); default: - err = -EINVAL; - break; + return -EINVAL; } +} -out: - kfree(kaddrs); +static int sctp_setsockopt_bindx(struct sock *sk, + struct sockaddr __user *addrs, + int addrs_size, int op) +{ + struct sockaddr *kaddrs; + int err; + kaddrs = memdup_user(addrs, addrs_size); + if (IS_ERR(kaddrs)) + return PTR_ERR(kaddrs); + err = sctp_setsockopt_bindx_kernel(sk, kaddrs, addrs_size, op); + kfree(kaddrs); return err; } -- cgit v1.2.3 From c0425a4249e9d313eec5f81c0bde8a286ebf9a63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:42 +0200 Subject: net: add a new bind_add method The SCTP protocol allows binding multiple addresses to a socket. That feature is currently only exposed as a socket option. Add a bind_add method to struct proto that allows binding additional addresses, and switch the dlm code to use the method instead of going through the socket option from kernel space. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S.
Miller --- fs/dlm/lowcomms.c | 9 +++------ include/net/sock.h | 6 +++++- net/core/sock.c | 8 ++++++++ net/sctp/socket.c | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f1c3cdc9d65..3543a8fec907 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -882,6 +882,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) static int sctp_bind_addrs(struct connection *con, uint16_t port) { struct sockaddr_storage localaddr; + struct sockaddr *addr = (struct sockaddr *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { @@ -889,13 +890,9 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port) make_sockaddr(&localaddr, port, &addr_len); if (!i) - result = kernel_bind(con->sock, - (struct sockaddr *)&localaddr, - addr_len); + result = kernel_bind(con->sock, addr, addr_len); else - result = kernel_setsockopt(con->sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, - (char *)&localaddr, addr_len); + result = sock_bind_add(con->sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", diff --git a/include/net/sock.h b/include/net/sock.h index d994daa418ec..6e9f713a7860 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1156,7 +1156,9 @@ struct proto { int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, - struct sockaddr *uaddr, int addr_len); + struct sockaddr *addr, int addr_len); + int (*bind_add)(struct sock *sk, + struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -2698,4 +2700,6 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 2ca3425b519c..61ec573221a6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3712,3 +3712,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6e745ac3c4a5..d57e1a002ffc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1052,6 +1052,18 @@ static int sctp_setsockopt_bindx(struct sock *sk, return err; } +static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, + int addrlen) +{ + int err; + + lock_sock(sk); + err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, + SCTP_BINDX_ADD_ADDR); + release_sock(sk); + return err; +} + static int sctp_connect_new_asoc(struct sctp_endpoint *ep, const union sctp_addr *daddr, const struct sctp_initmsg *init, @@ -9620,6 +9632,7 @@ struct proto sctp_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, @@ -9662,6 +9675,7 @@ struct proto sctpv6_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, -- cgit v1.2.3 From 5a892ff2facb4548c17c05931ed899038a0da63e Mon Sep 
17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:43 +0200 Subject: net: remove kernel_setsockopt No users left. Signed-off-by: Christoph Hellwig Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/net.h | 2 -- net/socket.c | 31 ------------------------------- 2 files changed, 33 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 74ef5d7315f7..e10f378194a5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, - unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, diff --git a/net/socket.c b/net/socket.c index 81a98b6cbd08..976426d03f09 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3624,37 +3624,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_setsockopt - set a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Returns 0 or an error. - */ - -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *optval, unsigned int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *) optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_setsockopt); - /** * kernel_sendpage - send a &page through a socket (kernel space) * @sock: socket -- cgit v1.2.3 From 04198499b23f9d73a127a17b8576c9266c8f6f9b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 27 May 2020 19:41:34 +0300 Subject: net: dsa: tag_8021q: stop restoring VLANs from bridge Right now, our only tag_8021q user, sja1105, has the ability to restore bridge VLANs on its own, so this logic is unnecessary. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_8021q.c | 61 +---------------------------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3052da668156..780b2a15ac9b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -140,34 +140,6 @@ bool vid_is_dsa_8021q(u16 vid) } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) -{ - struct bridge_vlan_info vinfo; - struct net_device *slave; - u16 pvid; - int err; - - if (!dsa_is_user_port(ds, port)) - return 0; - - slave = dsa_to_port(ds, port)->slave; - - err = br_vlan_get_pvid(slave, &pvid); - if (!pvid || err < 0) - /* There is no pvid on the bridge for this port, which is - * perfectly valid. Nothing to restore, bye-bye! 
- */ - return 0; - - err = br_vlan_get_info(slave, pvid, &vinfo); - if (err < 0) { - dev_err(ds->dev, "Couldn't determine PVID attributes\n"); - return err; - } - - return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); -} - /* If @enabled is true, installs @vid with @flags into the switch port's HW * filter. * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the @@ -178,39 +150,11 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { struct dsa_port *dp = dsa_to_port(ds, port); - struct bridge_vlan_info vinfo; - int err; if (enabled) return dsa_port_vid_add(dp, vid, flags); - err = dsa_port_vid_del(dp, vid); - if (err < 0) - return err; - - /* Nothing to restore from the bridge for a non-user port. - * The CPU port VLANs are restored implicitly with the user ports, - * similar to how the bridge does in dsa_slave_vlan_add and - * dsa_slave_vlan_del. - */ - if (!dsa_is_user_port(ds, port)) - return 0; - - err = br_vlan_get_info(dp->slave, vid, &vinfo); - /* Couldn't determine bridge attributes for this vid, - * it means the bridge had not configured it. - */ - if (err < 0) - return 0; - - /* Restore the VID from the bridge */ - err = dsa_port_vid_add(dp, vid, vinfo.flags); - if (err < 0) - return err; - - vinfo.flags &= ~BRIDGE_VLAN_INFO_PVID; - - return dsa_port_vid_add(dp->cpu_dp, vid, vinfo.flags); + return dsa_port_vid_del(dp, vid); } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -329,9 +273,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) return err; } - if (!enabled) - err = dsa_8021q_restore_pvid(ds, port); - return err; } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); -- cgit v1.2.3 From 8298a419a0064c6cd6c84a4a45de294a3eff0832 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 07:43:59 +0000 Subject: tipc: remove set but not used variable 'prev' Fixes gcc '-Wunused-but-set-variable' warning: net/tipc/msg.c: In function 'tipc_msg_append': net/tipc/msg.c:215:24: warning: variable 'prev' set but not used [-Wunused-but-set-variable] Commit 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") left this behind; remove it. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/tipc/msg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 23809039dda1..c0afcd627c5e 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -212,7 +212,7 @@ err: int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) { - struct sk_buff *skb, *prev; + struct sk_buff *skb; int accounted, total, curr; int mlen, cpy, rem = dlen; struct tipc_msg *hdr; @@ -223,7 +223,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, while (rem) { if (!skb || skb->len >= mss) { - prev = skb; skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; -- cgit v1.2.3 From fd55199d3b762f9555c66a1bc4bf6eac3901fc2f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 28 May 2020 23:43:24 +0200 Subject: net: ethtool: cabletest: Make ethnl_act_cable_test_tdr_cfg static kbuild test robot is reporting: net/ethtool/cabletest.c:230:5: warning: no previous prototype for 'ethnl_act_cable_test_tdr_cfg'. Mark the function as static. Reported-by: kbuild test robot Signed-off-by: Andrew Lunn Signed-off-by: David S.
Miller --- net/ethtool/cabletest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 9991688d7d1d..7b7a0456c15c 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -227,9 +227,9 @@ cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { }; /* CABLE_TEST_TDR_ACT */ -int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, - struct genl_info *info, - struct phy_tdr_config *cfg) +static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; int ret; -- cgit v1.2.3 From bc183dec08f9cb177cf5206a010b7a9e7b22e567 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 May 2020 00:01:52 +0200 Subject: tcp: tcp_init_buffer_space can be static As of commit 98fa6271cfcb ("tcp: refactor setting the initial congestion window") this is called only from tcp_input.c, so it can be static. Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 66e4b8331850..bca761ffa25f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -662,7 +662,6 @@ void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -void tcp_init_buffer_space(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ad90102f5dfb..83330a6cb242 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -437,7 +437,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ -void tcp_init_buffer_space(struct sock *sk) +static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 86ae579cefffe18cb08928505d90fbc87367e8f5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 27 May 2020 08:35:03 +0300 Subject: net: Make mpls_entry_encode() available for generic users Move mpls_entry_encode() from net/mpls/internal.h to include/net/mpls.h and make it available for other users. Specifically, hardware drivers that offload MPLS can benefit from it. Suggested-by: Jakub Kicinski Suggested-by: David Ahern Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed --- include/net/mpls.h | 17 +++++++++++++++++ net/mpls/internal.h | 11 ----------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/mpls.h b/include/net/mpls.h index ccaf238e8ea7..0bb7944e7b08 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -8,6 +8,7 @@ #include #include +#include #define MPLS_HLEN 4 @@ -25,4 +26,20 @@ static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) { return (struct mpls_shim_hdr *)skb_network_header(skb); } + +static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, + unsigned int ttl, + unsigned int tc, + bool bos) +{ + struct mpls_shim_hdr result; + + result.label_stack_entry = + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | + (tc << MPLS_LS_TC_SHIFT) | + (bos ?
(1 << MPLS_LS_S_SHIFT) : 0) | + (ttl << MPLS_LS_TTL_SHIFT)); + return result; +} + #endif diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 0e9aa94adc07..838cdfc10e47 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -172,17 +172,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) -{ - struct mpls_shim_hdr result; - result.label_stack_entry = - cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | - (tc << MPLS_LS_TC_SHIFT) | - (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | - (ttl << MPLS_LS_TTL_SHIFT)); - return result; -} - static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) { struct mpls_entry_decoded result; -- cgit v1.2.3 From b8ded9de8db34dd209a3dece94cf54fc414e78f7 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sat, 30 May 2020 16:42:37 +0200 Subject: net/smc: pre-fetch send buffer outside of send_lock Pre-fetch send buffer for the CDC validation message before entering the send_lock. Without that the send call might fail with -EBUSY because there are no free buffers and waiting for buffers is not possible under send_lock. Signed-off-by: Karsten Graul Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 10 +++------- net/smc/smc_cdc.h | 4 +++- net/smc/smc_core.c | 18 +++++++++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b2b85e1be72c..a47e8855e045 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -116,19 +116,15 @@ int smc_cdc_msg_send(struct smc_connection *conn, } /* send a validation msg indicating the move of a conn to an other QP link */ -int smcr_cdc_msg_send_validation(struct smc_connection *conn) +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) { struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; struct smc_link *link = conn->lnk; - struct smc_cdc_tx_pend *pend; - struct smc_wr_buf *wr_buf; struct smc_cdc_msg *peer; int rc; - rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); - if (rc) - return rc; - peer = (struct smc_cdc_msg *)wr_buf; peer->common.type = local->common.type; peer->len = local->len; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 2ddcc5fb5ceb..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -296,7 +296,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); -int smcr_cdc_msg_send_validation(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65de700e1f17..7964a21e5e6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -483,7 +483,8 @@ static int smc_write_space(struct smc_connection *conn) return space; } -static int smc_switch_cursor(struct smc_sock *smc) +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) { struct smc_connection *conn = &smc->conn; union smc_host_cursor cons, fin; @@ -520,11 +521,14 @@ static int smc_switch_cursor(struct smc_sock *smc) if 
(smc->sk.sk_state != SMC_INIT && smc->sk.sk_state != SMC_CLOSED) { - rc = smcr_cdc_msg_send_validation(conn); + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); if (!rc) { schedule_delayed_work(&conn->tx_work, 0); smc->sk.sk_data_ready(&smc->sk); } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); } return rc; } @@ -533,7 +537,9 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err) { struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; struct smc_connection *conn; + struct smc_wr_buf *wr_buf; struct smc_sock *smc; struct rb_node *node; int i, rc = 0; @@ -582,10 +588,16 @@ again: } sock_hold(&smc->sk); read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); conn->lnk = to_lnk; - rc = smc_switch_cursor(smc); + rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); if (rc) { -- cgit v1.2.3 From b0c19ed6088ab41dd2a727b60594b7297c15d6ce Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 29 May 2020 14:43:44 +0200 Subject: sch_cake: Take advantage of skb->hash where appropriate While the other fq-based qdiscs take advantage of skb->hash and don't recompute it if it is already set, sch_cake does not. This was a deliberate choice because sch_cake hashes various parts of the packet header to support its advanced flow isolation modes. However, foregoing the use of skb->hash entirely loses a few important benefits: - When skb->hash is set by hardware, a few CPU cycles can be saved by not hashing again in software. - Tunnel encapsulations will generally preserve the value of skb->hash from before the encapsulation, which allows flow-based qdiscs to distinguish between flows even though the outer packet header no longer has flow information. It turns out that we can preserve these desirable properties in many cases, while still supporting the advanced flow isolation properties of sch_cake. This patch does so by reusing the skb->hash value as the flow_hash part of the hashing procedure in cake_hash() only in the following conditions: - If the skb->hash is marked as covering the flow headers (skb->l4_hash is set) AND - NAT header rewriting is either disabled, or did not change any values used for hashing. The latter is important to match local-origin packets such as those of a tunnel endpoint. The immediate motivation for fixing this was the recent patch to WireGuard to preserve the skb->hash on encapsulation. As such, this is also what I tested against; with this patch, added latency under load for competing flows drops from ~8 ms to sub-1ms on an RRUL test over a WireGuard tunnel going through a virtual link shaped to 1Gbps using sch_cake. This matches the results we saw with a similar setup using sch_fq_codel when testing the WireGuard patch. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S.
Miller --- net/sched/sch_cake.c | 65 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 1496e87cd07b..60f8ae578819 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -584,26 +584,48 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } -static void cake_update_flowkeys(struct flow_keys *keys, +static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; - bool rev = !skb->_nfct; + bool rev = !skb->_nfct, upd = false; + __be32 ip; if (tc_skb_protocol(skb) != htons(ETH_P_IP)) - return; + return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) - return; + return false; - keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; - keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + if (ip != keys->addrs.v4addrs.src) { + keys->addrs.v4addrs.src = ip; + upd = true; + } + ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + if (ip != keys->addrs.v4addrs.dst) { + keys->addrs.v4addrs.dst = ip; + upd = true; + } if (keys->ports.ports) { - keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; - keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + __be16 port; + + port = rev ? tuple.dst.u.all : tuple.src.u.all; + if (port != keys->ports.src) { + keys->ports.src = port; + upd = true; + } + port = rev ? tuple.src.u.all : tuple.dst.u.all; + if (port != keys->ports.dst) { + keys->ports.dst = port; + upd = true; + } } + return upd; +#else + return false; #endif } @@ -624,23 +646,36 @@ static bool cake_ddst(int flow_mode) static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { + bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); + bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); + bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; + bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; - /* If both overrides are set we can skip packet dissection entirely */ - if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && - (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) + /* If both overrides are set, or we can use the SKB hash and nat mode is + * disabled, we can skip packet dissection entirely. If nat mode is + * enabled there's another check below after doing the conntrack lookup.
+ */ + if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (flow_mode & CAKE_FLOW_NAT_FLAG) - cake_update_flowkeys(&keys, skb); + /* Don't use the SKB hash if we change the lookup keys from conntrack */ + if (nat_enabled && cake_update_flowkeys(&keys, skb)) + use_skbhash = false; + + /* If we can still use the SKB hash and don't need the host hash, we can + * skip the rest of the hashing procedure + */ + if (use_skbhash && !hash_hosts) + goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat @@ -679,12 +714,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ - if (flow_mode & CAKE_FLOW_FLOWS) + if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; + else if (use_skbhash) + flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; -- cgit v1.2.3 From 39884604b11692158ce0c559fc603510b96f8c2e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 May 2020 17:49:18 +0200 Subject: mptcp: fix NULL ptr dereference in MP_JOIN error path When token lookup on MP_JOIN 3rd ack fails, the server closes the incoming child socket with a reset. Such a socket has the 'is_mptcp' flag set, but no msk socket associated, due to the failed lookup. While crafting the reset packet, mptcp_established_options_mp() will try to dereference the child's master socket, causing a NULL ptr dereference. This change addresses the issue with an explicit fallback to TCP in this error path. Fixes: 729cd6436f35 ("mptcp: cope better with MP_JOIN failure") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f3c06b8af92d..493b98a0825c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -413,6 +413,20 @@ static void subflow_ulp_fallback(struct sock *sk, tcp_sk(sk)->is_mptcp = 0; } +static void subflow_drop_ctx(struct sock *ssk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + + if (!ctx) + return; + + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -485,10 +499,7 @@ create_child: if (fallback_is_fatal) goto dispose_child; - if (ctx) { - subflow_ulp_fallback(child, ctx); - kfree_rcu(ctx, rcu); - } + subflow_drop_ctx(child); goto out; } @@ -537,6 +548,7 @@ out: return child; dispose_child: + subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; tcp_send_active_reset(child, GFP_ATOMIC); inet_csk_prepare_for_destroy_sock(child); -- cgit v1.2.3 From 5e9cf0f0a3e98992184442de24253fe1b9c40f2e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 May 2020 14:04:27 +0200 Subject: cfg80211: fix 6 GHz frequencies to kHz The updates to change to kHz frequencies and the 6 GHz additions evidently overlapped (or rather, I didn't see it when applying the latter), so the 6 GHz is broken. Fix this.
Fixes: 934f4c7dd3a5 ("cfg80211: express channels with a KHz component") Link: https://lore.kernel.org/r/20200529140425.1bf824f6911b.I4a1174916b8f5965af4366999eb9ffc7a0347470@changeid Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index df75e58eca5d..5b3b0d1222a2 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -94,7 +94,7 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) case NL80211_BAND_6GHZ: /* see 802.11ax D4.1 27.3.22.2 */ if (chan <= 253) - return 5940 + chan * 5; + return MHZ_TO_KHZ(5940 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) -- cgit v1.2.3 From d1a1646c0de7b0083d42a1ada72a3ec243bfcc6d Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 29 May 2020 11:41:43 +0200 Subject: cfg80211: adapt to new channelization of the 6GHz band The 6GHz band does not have regulatory approval yet, but things are moving forward. However, that has led to a change in the channelization of the 6GHz band which has been accepted in the 11ax specification. While at it, it also adds a missing MHZ_TO_KHZ() conversion for 6GHz channels. This change is primarily thrown in to discuss how to deal with it. I noticed ath11k adding 6G support with old channelization and ditto for iw. It probably involves changes in hostapd as well. Cc: Pradeep Kumar Chitrapu Cc: Jouni Malinen Signed-off-by: Arend van Spriel Link: https://lore.kernel.org/r/edf07cdd-ad15-4012-3afd-d8b961a80b69@broadcom.com Signed-off-by: Johannes Berg --- net/wireless/util.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 5b3b0d1222a2..a27d4f45fb5f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -92,9 +92,11 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: - /* see 802.11ax D4.1 27.3.22.2 */ + /* see 802.11ax D6.1 27.3.23.2 */ + if (chan == 2) + return MHZ_TO_KHZ(5935); if (chan <= 253) - return MHZ_TO_KHZ(5940 + chan * 5); + return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) -- cgit v1.2.3 From 0e47901d78f0b91901f845c2fc575ae48d8ed395 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:24 +0200 Subject: nl80211: really allow client-only BIGTK support My previous commit here was wrong; it didn't check the new flag in two necessary places, so things didn't work. Fix that. Fixes: 155d7c733807 ("nl80211: allow client-only BIGTK support") Link: https://lore.kernel.org/r/20200528213443.993f108e96ca.I0086ae42d672379380d04ac5effb2f3d5135731b@changeid Signed-off-by: Johannes Berg --- net/wireless/sme.c | 7 +++++-- net/wireless/util.c | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 3554c0d951f4..15595cf401de 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved.
* Copyright 2017 Intel Deutschland GmbH */ @@ -1118,7 +1118,10 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, if (wiphy_ext_feature_isset( wdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset( + wdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, i, false, NULL); diff --git a/net/wireless/util.c b/net/wireless/util.c index a27d4f45fb5f..4d3b76f94f55 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -242,7 +242,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, int max_key_idx = 5; if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; if (key_idx < 0 || key_idx > max_key_idx) return -EINVAL; -- cgit v1.2.3 From 43e64bf301fd8c54f0082d91c6ffd4de861baf96 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:29 +0200 Subject: cfg80211: handle 6 GHz capability of new station Handle 6 GHz HE capability while adding new station. It will be used later in mac80211 station processing. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-2-git-send-email-rmanohar@codeaurora.org [handle nl80211_set_station, require WME, remove NL80211_HE_6GHZ_CAPABILITY_LEN] Link: https://lore.kernel.org/r/20200528213443.b6b711fd4312.Ic9b97d57b6c4f2b28d4b2d23d2849d8bc20bd8cc@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a38653358885..da734ea71b5a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1238,6 +1238,7 @@ struct sta_txpwr { * @he_capa_len: the length of the HE capabilities * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station + * @he_6ghz_capa: HE 6 GHz Band capabilities of station */ struct station_parameters { const u8 *supported_rates; @@ -1270,6 +1271,7 @@ struct station_parameters { u8 he_capa_len; u16 airtime_weight; struct sta_txpwr txpwr; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c14666b75e57..e42ae429383e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2502,6 +2502,9 @@ enum nl80211_commands { * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * + * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from + * association request when used with NL80211_CMD_NEW_STATION). 
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2982,6 +2985,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_CENTER_FREQ1_OFFSET,
 	NL80211_ATTR_SCAN_FREQ_KHZ,
 
+	NL80211_ATTR_HE_6GHZ_CAPABILITY,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 22c4d13e28cb..bf8bd8268cb7 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -654,6 +654,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG },
 	[NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999),
 	[NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED },
+	[NL80211_ATTR_HE_6GHZ_CAPABILITY] = {
+		.type = NLA_EXACT_LEN,
+		.len = sizeof(struct ieee80211_he_6ghz_capa),
+	},
 };
 
 /* policy for the key attributes */
@@ -5989,6 +5993,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 			nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
 	}
 
+	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
+		params.he_6ghz_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
+
 	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
 		params.airtime_weight =
 			nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);
@@ -6123,6 +6131,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 	}
 
+	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
+		params.he_6ghz_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
+
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
 		params.opmode_notif_used = true;
 		params.opmode_notif =
@@ -6167,10 +6179,14 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.vht_capa = NULL;
 
 		/* HE requires WME */
-		if (params.he_capa_len)
+		if (params.he_capa_len || params.he_6ghz_capa)
 			return -EINVAL;
 	}
 
+	/* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */
+	if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa))
+		return -EINVAL;
+
 	/* When you run into this, adjust the code below for the new flag */
 	BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
 
--
cgit v1.2.3

From a6cf28e05f0b3bda6ff0c58100324ac91aec6027 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan
Date: Thu, 28 May 2020 21:34:30 +0200
Subject: mac80211: add HE 6 GHz Band Capabilities into parse extension

Handle 6 GHz band capability element parsing for association.
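For orientation, a minimal standalone sketch of the extension-element
matching such a parser performs: the element carries a one-byte
extension ID after the usual EID/length header, and the capability is
only accepted at its exact expected size. The struct layout and the
extension-ID value here are assumptions of the sketch, not kernel
definitions.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define EID_EXTENSION		255
#define EID_EXT_HE_6GHZ_CAPA	59	/* assumed value for the sketch */

struct he_6ghz_capa {
	uint16_t capa;			/* little-endian on the air */
} __attribute__((packed));

static const struct he_6ghz_capa *
parse_he_6ghz_capa(const uint8_t *ie, size_t ie_len)
{
	/* ie[0]=EID, ie[1]=length, ie[2]=extension ID, payload follows */
	if (ie_len < 3 || ie[0] != EID_EXTENSION)
		return NULL;
	if (ie[2] != EID_EXT_HE_6GHZ_CAPA)
		return NULL;
	/* the length byte counts the extension ID plus the payload */
	if (ie[1] != 1 + sizeof(struct he_6ghz_capa))
		return NULL;
	return (const struct he_6ghz_capa *)&ie[3];
}

int main(void)
{
	const uint8_t elem[] = { 255, 3, 59, 0x34, 0x12 };

	if (parse_he_6ghz_capa(elem, sizeof(elem)))
		printf("capa ok\n");
	return 0;
}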
Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-4-git-send-email-rmanohar@codeaurora.org [some renaming to be in line with previous patches] Link: https://lore.kernel.org/r/20200528213443.a13d7a0b85b0.Ia07584da4fc77aa77c4cc563248d2ce4234ffe5d@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/util.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b7935f3d000d..dac016636d12 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1494,6 +1494,7 @@ struct ieee802_11_elems { const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 20436c86b9bf..5d2c5ae8aadb 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -936,6 +936,10 @@ static void ieee80211_parse_extension_element(u32 *crc, len >= ieee80211_he_spr_size(data)) elems->he_spr = data; break; + case WLAN_EID_EXT_HE_6GHZ_CAPA: + if (len == sizeof(*elems->he_6ghz_capa)) + elems->he_6ghz_capa = data; + break; } } -- cgit v1.2.3 From 223952177296c34d9c8de9cde33204caffe55725 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:31 +0200 Subject: cfg80211: add and expose HE 6 GHz band capabilities These capabilities cover what would otherwise be transported in HT/VHT capabilities, but only a subset thereof that is actually needed on 6 GHz with HE already present. Expose the capabilities to userspace, drivers are expected to set them as using the 6 GHz band (currently) requires HE capability. Link: https://lore.kernel.org/r/20200528213443.244cd5cb9db8.Icd8c773277a88c837e7e3af1d4d1013cc3b66543@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index da734ea71b5a..9b76be3d561a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -354,10 +354,13 @@ struct ieee80211_sta_he_cap { * * @types_mask: interface types mask * @he_cap: holds the HE capabilities + * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a + * 6 GHz band channel (and 0 may be valid value). 
*/ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e42ae429383e..5b350d032fa3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3565,6 +3565,8 @@ enum nl80211_mpath_info { * defined in HE capabilities IE * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently * defined + * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), + * given for all 6 GHz band channels * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use */ enum nl80211_band_iftype_attr { @@ -3575,6 +3577,7 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bf8bd8268cb7..3a24e6add13e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1562,6 +1562,7 @@ static int nl80211_send_coalesce(struct sk_buff *msg, static int nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *iftdata) { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; @@ -1585,6 +1586,12 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (sband->band == NL80211_BAND_6GHZ && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + sizeof(iftdata->he_6ghz_capa), + &iftdata->he_6ghz_capa)) + return -ENOBUFS; + return 0; } @@ -1633,7 +1640,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, if (!iftdata) return -ENOBUFS; - err = nl80211_send_iftype_data(msg, + err = nl80211_send_iftype_data(msg, sband, &sband->iftype_data[i]); if (err) return err; -- cgit v1.2.3 From 24a2042cb22fdfc7feef0df9622f0d9d71b8ced1 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:32 +0200 Subject: mac80211: add HE 6 GHz Band Capability element Construct HE 6 GHz band capability element (IEEE 802.11ax/D6.0, 9.4.2.261) for association request and mesh beacon. The 6 GHz capability information is passed by driver through iftypes caps. 
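The element this helper emits is only five bytes on the air. Below is a
standalone re-implementation of the tail of
ieee80211_ie_build_he_6ghz_cap(), with the EID constants assumed for
illustration and put_unaligned_le16() open-coded:

#include <stdint.h>
#include <stdio.h>

#define EID_EXTENSION		255	/* WLAN_EID_EXTENSION */
#define EID_EXT_HE_6GHZ_CAPA	59	/* WLAN_EID_EXT_HE_6GHZ_CAPA (assumed) */

static uint8_t *build_he_6ghz_capa_elem(uint8_t *pos, uint16_t cap)
{
	*pos++ = EID_EXTENSION;
	*pos++ = 1 + sizeof(cap);	/* ext ID byte + 2-byte capability */
	*pos++ = EID_EXT_HE_6GHZ_CAPA;
	*pos++ = cap & 0xff;		/* little-endian, unaligned-safe */
	*pos++ = cap >> 8;
	return pos;
}

int main(void)
{
	uint8_t buf[5];
	uint8_t *end = build_he_6ghz_capa_elem(buf, 0x1234);

	for (uint8_t *p = buf; p < end; p++)
		printf("%02x ", *p);
	printf("\n");
	return 0;
}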
Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-7-git-send-email-rmanohar@codeaurora.org [handle SMPS, adjust for previous patches, reserve SKB space properly, change to handle SKB directly] Link: https://lore.kernel.org/r/20200528213443.643aa8101111.I3f9747c1147480f65445f13eda5c4a5ed4e86757@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mesh.c | 9 +++++++++ net/mac80211/mesh.h | 2 ++ net/mac80211/mesh_plink.c | 4 +++- net/mac80211/mlme.c | 3 +++ net/mac80211/util.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 65 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index dac016636d12..344ea828e806 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2177,6 +2177,8 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); u8 *ieee80211_ie_build_he_cap(u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end); +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); u8 *ieee80211_ie_build_he_oper(u8 *pos); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5930d07b1e43..5e8d72bdbb98 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -587,6 +587,13 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + ieee80211_ie_build_he_6ghz_cap(sdata, skb); + return 0; +} + static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = @@ -766,6 +773,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -885,6 +893,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 953f720754e8..40492d1bd8fd 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -222,6 +222,8 @@ int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ie_len); int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 737c5f4dbf52..3aca89c97f36 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -328,7 +329,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mesh_add_vht_cap_ie(sdata, 
skb) || mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || - mesh_add_he_oper_ie(sdata, skb)) + mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb)) goto free; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a259b4487b60..f6ddce646f18 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -658,6 +658,8 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, he_cap->he_cap_elem.phy_cap_info); pos = skb_put(skb, he_cap_size); ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); + + ieee80211_ie_build_he_6ghz_cap(sdata, skb); } static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) @@ -731,6 +733,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5d2c5ae8aadb..048b38546a56 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2839,6 +2839,52 @@ end: return pos; } +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_sband_iftype_data *iftd; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + u8 *pos; + u16 cap; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + if (WARN_ON(!iftd)) + return; + + cap = le16_to_cpu(iftd->he_6ghz_capa.capa); + cap &= ~IEEE80211_HE_6GHZ_CAP_SM_PS; + + switch (sdata->smps_mode) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + /* fall through */ + case IEEE80211_SMPS_OFF: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_STATIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + } + + pos = skb_put(skb, 2 + 1 + sizeof(cap)); + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + put_unaligned_le16(cap, pos); +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) -- cgit v1.2.3 From d1b7524b3ea140e552658485d4e9dce5ee2953e1 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:33 +0200 Subject: mac80211: build HE operation with 6 GHz oper information Add 6 GHz operation information (IEEE 802.11ax/D6.0, Figure 9-787k) while building HE operation element for non-HE AP. This field is used to determine channel information in the absence of HT/VHT IEs. 
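The 160 MHz interop workaround in this patch is easiest to follow with
numbers: ccfs1 keeps the 160 MHz block center while ccfs0 is shifted by
8 channel units (40 MHz) to the 80 MHz segment holding the primary
channel. A standalone sketch, with the struct and helper names invented
for illustration:

#include <stdint.h>
#include <stdio.h>

struct chandef_sketch {
	int primary_mhz;	/* primary channel center frequency */
	int center1_mhz;	/* center of the full 160 MHz block */
};

/* Post-D6.1 6 GHz channelization: channel N sits at 5950 + N * 5 MHz */
static uint8_t freq_to_chan_6ghz(int mhz)
{
	return (mhz - 5950) / 5;
}

static void fill_160mhz(const struct chandef_sketch *c,
			uint8_t *ccfs0, uint8_t *ccfs1)
{
	*ccfs0 = freq_to_chan_6ghz(c->center1_mhz);
	*ccfs1 = *ccfs0;		/* ccfs1 keeps the 160 MHz center */
	if (c->primary_mhz < c->center1_mhz)
		*ccfs0 -= 8;		/* lower 80 MHz segment (-40 MHz) */
	else
		*ccfs0 += 8;		/* upper 80 MHz segment (+40 MHz) */
}

int main(void)
{
	struct chandef_sketch c = { .primary_mhz = 5955, .center1_mhz = 6025 };
	uint8_t ccfs0, ccfs1;

	fill_160mhz(&c, &ccfs0, &ccfs1);
	printf("ccfs0=%u ccfs1=%u\n", ccfs0, ccfs1);	/* 7 and 15 */
	return 0;
}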
Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-8-git-send-email-rmanohar@codeaurora.org [fix skb allocation size] Link: https://lore.kernel.org/r/20200528193455.76796-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mesh.c | 12 ++++++--- net/mac80211/mesh_plink.c | 1 + net/mac80211/util.c | 65 +++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 72 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 344ea828e806..9f874ce500f6 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2179,7 +2179,7 @@ u8 *ieee80211_ie_build_he_cap(u8 *pos, u8 *end); void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); -u8 *ieee80211_ie_build_he_oper(u8 *pos); +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 5e8d72bdbb98..5f3d45474db6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -565,6 +565,7 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; + u32 len; u8 *pos; sband = ieee80211_get_sband(sdata); @@ -578,11 +579,15 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; - if (skb_tailroom(skb) < 2 + 1 + sizeof(struct ieee80211_he_operation)) + len = 2 + 1 + sizeof(struct ieee80211_he_operation); + if (sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) + len += sizeof(struct ieee80211_he_6ghz_oper); + + if (skb_tailroom(skb) < len) return -ENOMEM; - pos = skb_put(skb, 2 + 1 + sizeof(struct ieee80211_he_operation)); - ieee80211_ie_build_he_oper(pos); + pos = skb_put(skb, len); + ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chandef); return 0; } @@ -773,6 +778,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 3aca89c97f36..fbbfc5d4a51c 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 048b38546a56..87dd003dbdf2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3008,13 +3008,18 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -u8 *ieee80211_ie_build_he_oper(u8 *pos) +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; + struct ieee80211_he_6ghz_oper *he_6ghz_op; u32 he_oper_params; + u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); + + if 
(chandef->chan->band == NL80211_BAND_6GHZ) + ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(struct ieee80211_he_operation); + *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; @@ -3024,16 +3029,68 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos) IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + if (chandef->chan->band == NL80211_BAND_6GHZ) + he_oper_params |= u32_encode_bits(1, + IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); + pos += sizeof(struct ieee80211_he_operation); - /* TODO add VHT operational and 6GHz operational subelement? */ + if (chandef->chan->band != NL80211_BAND_6GHZ) + goto out; - return pos + sizeof(struct ieee80211_vht_operation); + /* TODO add VHT operational */ + he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; + he_6ghz_op->minrate = 6; /* 6 Mbps */ + he_6ghz_op->primary = + ieee80211_frequency_to_channel(chandef->chan->center_freq); + he_6ghz_op->ccfs0 = + ieee80211_frequency_to_channel(chandef->center_freq1); + if (chandef->center_freq2) + he_6ghz_op->ccfs1 = + ieee80211_frequency_to_channel(chandef->center_freq2); + else + he_6ghz_op->ccfs1 = 0; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_160: + /* Convert 160 MHz channel width to new style as interop + * workaround. + */ + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; + if (chandef->chan->center_freq < chandef->center_freq1) + he_6ghz_op->ccfs0 -= 8; + else + he_6ghz_op->ccfs0 += 8; + fallthrough; + case NL80211_CHAN_WIDTH_80P80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + break; + case NL80211_CHAN_WIDTH_80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; + break; + case NL80211_CHAN_WIDTH_40: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; + break; + default: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; + break; + } + + pos += sizeof(struct ieee80211_he_6ghz_oper); + +out: + return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, -- cgit v1.2.3 From 607ca9ea3462719e256b60b24286f984e0d48c9b Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:34 +0200 Subject: mac80211: do not allow HT/VHT IEs in 6 GHz mesh mode As HT/VHT elements are not allowed in 6 GHz band, do not include them in mesh beacon template formation. 
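Note the guard pattern the next patch adds to each mesh IE builder: it
returns 0 (success) rather than an error, so the existing
"mesh_add_a(...) || mesh_add_b(...) || ..." call chains keep working and
the element is simply omitted on 6 GHz. A minimal sketch of the
pattern, with names invented for illustration:

#include <stdio.h>

enum band_sketch { BAND_2GHZ, BAND_5GHZ, BAND_6GHZ };

static int mesh_add_ht_cap_sketch(enum band_sketch band)
{
	if (band == BAND_6GHZ)
		return 0;	/* HT not allowed in 6 GHz: skip the IE */

	/* ... otherwise build and append the HT capability element ... */
	return 0;
}

int main(void)
{
	printf("%d\n", mesh_add_ht_cap_sketch(BAND_6GHZ));
	return 0;
}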
Signed-off-by: Rajkumar Manoharan
Link: https://lore.kernel.org/r/1589399105-25472-9-git-send-email-rmanohar@codeaurora.org
Link: https://lore.kernel.org/r/20200528193455.76796-2-johannes@sipsolutions.net
Signed-off-by: Johannes Berg
---
 net/mac80211/mesh.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 5f3d45474db6..79e0a90982dd 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -415,6 +415,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
 	if (!sband)
 		return -EINVAL;
 
+	/* HT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!sband->ht_cap.ht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -452,6 +456,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
 	sband = local->hw.wiphy->bands[channel->band];
 	ht_cap = &sband->ht_cap;
 
+	/* HT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!ht_cap->ht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -479,6 +487,10 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
 	if (!sband)
 		return -EINVAL;
 
+	/* VHT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!sband->vht_cap.vht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
@@ -516,6 +528,10 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
 	sband = local->hw.wiphy->bands[channel->band];
 	vht_cap = &sband->vht_cap;
 
+	/* VHT not allowed in 6 GHz */
+	if (sband->band == NL80211_BAND_6GHZ)
+		return 0;
+
 	if (!vht_cap->vht_supported ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
 	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
--
cgit v1.2.3

From 2a333a0db24e37daa2e4eb9a542c07deda44ca5a Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Thu, 28 May 2020 21:34:35 +0200
Subject: mac80211: avoid using ext NSS high BW if not supported

If the AP advertises inconsistent data, namely it has CCFS1 or CCFS2,
but doesn't advertise support for 160/80+80 bandwidth or "Extended NSS
BW Support", then we cannot use any MCSes in the higher bandwidth.
Thus, avoid connecting with higher bandwidth since it's less efficient
that way.
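The patch's switch over (supp_chwidth << 4) | ext_nss_bw_supp can be
exercised standalone. The sketch below models only the CCFS1 selection
per IEEE 802.11 Table 9-250 and omits the support_160/support_80_80
bookkeeping the kernel code also does:

#include <stdint.h>
#include <stdio.h>

static uint8_t pick_ccf1(uint8_t supp_chwidth, uint8_t ext_nss_bw_supp,
			 uint8_t ccfs1, uint8_t ccfs2)
{
	switch ((supp_chwidth << 4) | ext_nss_bw_supp) {
	default:
	case 0x00:
		return 0;	/* peer can't do 160/80+80 at all */
	case 0x01:
	case 0x02:
	case 0x03:
		return ccfs2;	/* new-style signalling via CCFS2 */
	case 0x10:
		return ccfs1;
	case 0x11:
	case 0x12:
		return ccfs1 ? ccfs1 : ccfs2;
	case 0x13:
	case 0x20:
	case 0x23:
		return ccfs1;
	}
}

int main(void)
{
	/* supp_chwidth=0, ext_nss_bw_supp=0: ignore CCFS1/CCFS2 */
	printf("%u\n", pick_ccf1(0, 0, 50, 114));
	return 0;
}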
Link: https://lore.kernel.org/r/20200528213443.0e55d40c3ccc.I6fd0b4708ebd087e5e46466c3e91f6efbcbef668@changeid Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 11 +++++++++-- net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/mesh.c | 16 ++++++++++++--- net/mac80211/mlme.c | 25 +++++++++++++++++------ net/mac80211/scan.c | 6 ++++++ net/mac80211/spectmgmt.c | 4 +++- net/mac80211/util.c | 49 ++++++++++++++++++++++++++++++++++++++++++---- 7 files changed, 100 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 2479cd48fed0..81d26fef41e9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2019 Intel Corporation + * Copyright(c) 2018-2020 Intel Corporation */ #include @@ -781,6 +781,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_channel_type ch_type; int err; u32 sta_flags; + u32 vht_cap_info = 0; sdata_assert_lock(sdata); @@ -798,9 +799,13 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, + vht_cap_info, sta_flags, ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) @@ -1060,8 +1065,10 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, /* we both use VHT */ struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; + u32 vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); - ieee80211_chandef_vht_oper(&local->hw, + ieee80211_chandef_vht_oper(&local->hw, vht_cap_info, elems->vht_operation, elems->ht_operation, &chandef); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9f874ce500f6..0cc584574976 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -111,6 +111,8 @@ struct ieee80211_bss { size_t supp_rates_len; struct ieee80211_rate *beacon_rate; + u32 vht_cap_info; + /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the @@ -1915,6 +1917,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band + * @vht_cap_info: VHT capabilities of the transmitter * @sta_flags: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted. 
Only the * following subset of &enum ieee80211_sta_flags are evaluated: @@ -1929,6 +1932,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); @@ -2194,7 +2198,7 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 79e0a90982dd..696d6fb322e6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona */ @@ -63,6 +63,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, u32 basic_rates = 0; struct cfg80211_chan_def sta_chan_def; struct ieee80211_supported_band *sband; + u32 vht_cap_info = 0; /* * As support for each feature is added, check for matching @@ -96,7 +97,11 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); - ieee80211_chandef_vht_oper(&sdata->local->hw, + + if (ie->vht_cap_elem) + vht_cap_info = le32_to_cpu(ie->vht_cap_elem->vht_cap_info); + + ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); @@ -1076,7 +1081,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_supported_band *sband; int err; - u32 sta_flags; + u32 sta_flags, vht_cap_info = 0; sdata_assert_lock(sdata); @@ -1099,8 +1104,13 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, sband->band, + vht_cap_info, sta_flags, sdata->vif.addr, &csa_ie); if (err < 0) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f6ddce646f18..1f9c48414c85 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -145,6 +145,7 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, + u32 vht_cap_info, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -223,7 +224,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); - if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) @@ -232,8 +233,10 @@ 
ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HE; goto out; } - } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper, - ht_oper, &vht_chandef)) { + } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, + vht_oper, ht_oper, + &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -329,6 +332,7 @@ out: static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, const struct ieee80211_ht_cap *ht_cap, + const struct ieee80211_vht_cap *vht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -343,6 +347,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, u16 ht_opmode; u32 flags; enum ieee80211_sta_rx_bandwidth new_sta_bw; + u32 vht_cap_info = 0; int ret; /* if HT was/is disabled, don't track any bandwidth changes */ @@ -371,8 +376,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } + if (vht_cap) + vht_cap_info = le32_to_cpu(vht_cap->vht_cap_info); + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ - flags = ieee80211_determine_chantype(sdata, sband, chan, + flags = ieee80211_determine_chantype(sdata, sband, chan, vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, true); @@ -1327,6 +1335,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; + struct ieee80211_bss *bss; int res; sdata_assert_lock(sdata); @@ -1338,7 +1347,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; current_band = cbss->channel->band; + bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, + bss->vht_cap_info, ifmgd->flags, ifmgd->associated->bssid, &csa_ie); @@ -4097,8 +4108,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); - if (ieee80211_config_bw(sdata, sta, - elems.ht_cap_elem, elems.ht_operation, + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, + elems.vht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); @@ -4815,6 +4826,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; bool have_80mhz; @@ -4913,6 +4925,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, + bss->vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, false); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5db15996524f..d0c2e8012118 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -132,6 +132,12 @@ ieee80211_update_bss_from_elems(struct ieee80211_local *local, bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } + + if (elems->vht_cap_elem) + bss->vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + else + bss->vht_cap_info = 0; } struct ieee80211_bss * diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 
5fe2b645912f..ae1cb2c68722 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018, 2020 Intel Corporation */ #include @@ -22,6 +22,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { @@ -150,6 +151,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, /* ignore if parsing fails */ if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, &vht_oper, &ht_oper, &new_vht_chandef)) new_vht_chandef.chan = NULL; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 87dd003dbdf2..e6104829fa1c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3120,7 +3120,7 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, return true; } -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) @@ -3132,6 +3132,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap; bool support_80_80 = false; bool support_160 = false; + u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); + u8 supp_chwidth = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; @@ -3151,11 +3155,48 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; - /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */ ccf0 = ccfs0; - ccf1 = ccfs1; - if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + + /* if not supported, parse as though we didn't understand it */ + if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + ext_nss_bw_supp = 0; + + /* + * Cf. IEEE 802.11 Table 9-250 + * + * We really just consider that because it's inefficient to connect + * at a higher bandwidth than we'll actually be able to use. + */ + switch ((supp_chwidth << 4) | ext_nss_bw_supp) { + default: + case 0x00: + ccf1 = 0; + support_160 = false; + support_80_80 = false; + break; + case 0x01: + support_80_80 = false; + /* fall through */ + case 0x02: + case 0x03: ccf1 = ccfs2; + break; + case 0x10: + ccf1 = ccfs1; + break; + case 0x11: + case 0x12: + if (!ccfs1) + ccf1 = ccfs2; + else + ccf1 = ccfs1; + break; + case 0x13: + case 0x20: + case 0x23: + ccf1 = ccfs1; + break; + } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); -- cgit v1.2.3 From 57fa5e85d53ce51e0cb06a7f320b79377d0fbe5f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:36 +0200 Subject: mac80211: determine chandef from HE 6 GHz operation Support connecting to HE 6 GHz APs and mesh networks on 6 GHz, where the HT/VHT information is missing but instead the HE 6 GHz band capability is present, and the 6 GHz Operation information field is used to encode the channel configuration instead of the HT/VHT operation elements. Also add some other bits needed to connect to 6 GHz networks. 
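The width decision for the 6 GHz Operation Information field is worth
spelling out: the control bits alone cannot distinguish 160 MHz from
80+80 MHz, so the distance between the two center-frequency segments
disambiguates (8 channel units, i.e. 40 MHz apart, means one contiguous
160 MHz block). A standalone sketch of that logic, with names invented
for illustration:

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

enum width_sketch { W20, W40, W80, W160, W80P80 };

static enum width_sketch he_6ghz_oper_width(unsigned int ctrl_width,
					    uint8_t ccfs0, uint8_t ccfs1,
					    int support_160, int support_80_80)
{
	switch (ctrl_width) {
	case 0:
		return W20;
	case 1:
		return W40;
	case 2:
		return W80;
	case 3:
		if (!ccfs1)
			return W80;	/* no second segment given */
		if (abs(ccfs1 - ccfs0) == 8)	/* 40 MHz apart: contiguous */
			return support_160 ? W160 : W80;
		return support_80_80 ? W80P80 : W80;
	}
	return W20;
}

int main(void)
{
	/* segments eight channel units apart -> contiguous 160 MHz */
	printf("%d\n", he_6ghz_oper_width(3, 7, 15, 1, 1));	/* W160 */
	return 0;
}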
Link: https://lore.kernel.org/r/1589399105-25472-10-git-send-email-rmanohar@codeaurora.org Co-developed-by: Rajkumar Manoharan Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/20200528213443.25687d2695bc.I3f9747c1147480f65445f13eda5c4a5ed4e86757@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 3 ++ net/mac80211/mesh.c | 1 + net/mac80211/mlme.c | 69 +++++++++++++++++++++-------- net/mac80211/util.c | 106 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 0cc584574976..6cac5bf7cba3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2202,6 +2202,9 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 696d6fb322e6..5f1ca25b6c97 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -104,6 +104,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); + ieee80211_chandef_he_6ghz_oper(sdata, ie->he_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1f9c48414c85..bc558d1d20fc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -156,15 +156,24 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap sta_ht_cap; u32 ht_cfreq, ret; - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - memset(chandef, 0, sizeof(struct cfg80211_chan_def)); chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; chandef->freq1_offset = channel->freq_offset; + if (channel->band == NL80211_BAND_6GHZ) { + if (!ieee80211_chandef_he_6ghz_oper(sdata, he_oper, chandef)) + ret = IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; + vht_chandef = *chandef; + goto out; + } + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -914,7 +923,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_add_ht_ie(sdata, skb, assoc_data->ap_ht_param, sband, chan, sdata->smps_mode); @@ -968,7 +978,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); @@ -3248,6 +3259,7 @@ static bool 
ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; u32 changed = 0; int err; bool ret; @@ -3289,11 +3301,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems->wmm_param) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems->ht_cap_elem || !elems->ht_operation)) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems->vht_cap_elem || !elems->vht_operation))) { + if (!is_6ghz && + ((assoc_data->wmm && !elems->wmm_param) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + (!elems->ht_cap_elem || !elems->ht_operation)) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3351,7 +3364,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. */ - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); @@ -3359,7 +3372,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); @@ -3367,6 +3380,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + if (is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + !elems->he_6ghz_capa) { + sdata_info(sdata, + "HE 6 GHz AP is missing HE 6 GHz band capability\n"); + ret = false; + goto out; + } + mutex_lock(&sdata->local->sta_mtx); /* * station info was already allocated and inserted before @@ -4826,6 +4847,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; @@ -4838,21 +4860,23 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, IEEE80211_STA_DISABLE_160MHZ); /* disable HT/VHT/HE if we don't support them */ - if (!sband->ht_cap.ht_supported) { + if (!sband->ht_cap.ht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - if (!sband->vht_cap.vht_supported) + if (!sband->vht_cap.vht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } if (!ieee80211_get_he_sta_cap(sband)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; rcu_read_lock(); - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && !is_6ghz) { const u8 *ht_oper_ie, *ht_cap_ie; 
ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); @@ -4869,7 +4893,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && !is_6ghz) { const u8 *vht_oper_ie, *vht_cap; vht_oper_ie = ieee80211_bss_get_ie(cbss, @@ -4934,6 +4958,11 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE && is_6ghz) { + sdata_info(sdata, "Rejecting non-HE 6/7 GHz connection"); + return -EINVAL; + } + /* will change later if needed */ sdata->smps_mode = IEEE80211_SMPS_OFF; @@ -5315,6 +5344,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { + bool is_6ghz = req->bss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)req->bss->priv; @@ -5457,14 +5487,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ht_ie && ht_ie[1] >= sizeof(struct ieee80211_ht_operation)) assoc_data->ap_ht_param = ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param; - else + else if (!is_6ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_HT; vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY); if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap)) memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, sizeof(struct ieee80211_vht_cap)); - else - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + else if (!is_6ghz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && diff --git a/net/mac80211/util.c b/net/mac80211/util.c index e6104829fa1c..cbe24d303f0d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3244,6 +3244,112 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, return true; } +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + const struct ieee80211_sta_he_cap *he_cap; + struct cfg80211_chan_def he_chandef = *chandef; + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + bool support_80_80, support_160; + u8 he_phy_cap; + u32 freq; + + if (chandef->chan->band != NL80211_BAND_6GHZ) + return true; + + sband = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + + he_cap = ieee80211_get_he_iftype_cap(sband, iftype); + if (!he_cap) { + sdata_info(sdata, "Missing iftype sband data/HE cap"); + return false; + } + + he_phy_cap = he_cap->he_cap_elem.phy_cap_info[0]; + support_160 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + support_80_80 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + + if (!he_oper) { + sdata_info(sdata, + "HE is not advertised on (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); + + if (!he_6ghz_oper) { + sdata_info(sdata, + "HE 6GHz operation missing (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, + 
NL80211_BAND_6GHZ); + he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); + + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_20; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_40; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + if (!he_6ghz_oper->ccfs1) + break; + if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) { + if (support_160) + he_chandef.width = NL80211_CHAN_WIDTH_160; + } else { + if (support_80_80) + he_chandef.width = NL80211_CHAN_WIDTH_80P80; + } + break; + } + + if (he_chandef.width == NL80211_CHAN_WIDTH_160) { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } else { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, + NL80211_BAND_6GHZ); + he_chandef.center_freq2 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } + + if (!cfg80211_chandef_valid(&he_chandef)) { + sdata_info(sdata, + "HE 6GHz operation resulted in invalid chandef: %d MHz/%d/%d MHz/%d MHz\n", + he_chandef.chan ? he_chandef.chan->center_freq : 0, + he_chandef.width, + he_chandef.center_freq1, + he_chandef.center_freq2); + return false; + } + + *chandef = he_chandef; + + return true; +} + int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) -- cgit v1.2.3 From 3b3ec3d52e8f72ec8c40477b96f23440a89000be Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Thu, 28 May 2020 21:34:37 +0200 Subject: mac80211: check the correct bit for EMA AP An AP supporting EMA (Enhanced Multi-BSSID advertisement) should set bit 83 in the extended capabilities IE (9.4.2.26 in the 802.11ax D5 spec). So the *3rd* bit of the 10th byte should be checked. Also, in one place, the wrong byte was checked. (cfg80211_find_ie returns a pointer to the beginning of the IE, so the data really starts at ie[2], so the 10th byte should be ie[12]. To avoid this confusion, use cfg80211_find_elem instead). 
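The byte/bit arithmetic behind this fix can be checked mechanically:
extended-capability bit 83 lands in (0-based) byte 83 / 8 = 10 at bit
83 % 8 = 3, and with the raw IE's two-byte header that byte sits at
ie[2 + 10] = ie[12].

#include <stdio.h>

int main(void)
{
	unsigned int bit = 83;

	printf("byte %u, bit %u -> BIT(%u), raw IE offset ie[%u]\n",
	       bit / 8, bit % 8, bit % 8, 2 + bit / 8);
	return 0;
}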
Signed-off-by: Shaul Triebitz Link: https://lore.kernel.org/r/20200528213443.4316121fa2a3.I9745582f8d41ad8e689dac0fefcd70b276d7c1ea@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- net/mac80211/mlme.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9580dfd9e2d1..1ecfd19f836d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3082,7 +3082,7 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7) /* Defines support for enhanced multi-bssid advertisement*/ -#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(1) +#define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bc558d1d20fc..c534cd1bb9cd 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5596,7 +5596,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; assoc_data->need_beacon = true; } else if (beacon_ies) { - const u8 *ie; + const struct element *elem; u8 dtim_count = 0; ieee80211_get_dtim(beacon_ies, &dtim_count, @@ -5613,15 +5613,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = dtim_count; } - ie = cfg80211_find_ext_ie(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 3) - sdata->vif.bss_conf.profile_periodicity = ie[4]; + elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 3) + sdata->vif.bss_conf.profile_periodicity = elem->data[2]; - ie = cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 11 && - (ie[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) + elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 11 && + (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) sdata->vif.bss_conf.ema_ap = true; } else { assoc_data->timeout = jiffies; -- cgit v1.2.3 From 1bb9a8a4c81d0305c511a0919cd30ebfa91915ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:38 +0200 Subject: mac80211: use HE 6 GHz band capability and pass it to the driver In order to handle 6 GHz AP side, take the HE 6 GHz band capability data and pass it to the driver (which needs it for A-MPDU spacing and A-MPDU length). 
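A standalone sketch of how the SM power save subfield is decoded out of
the 16-bit HE 6 GHz capability; the field position (bits 9-10) is an
assumption of the sketch, and the values reuse the HT SM PS encoding,
as the patch's switch shows:

#include <stdint.h>
#include <stdio.h>

#define HE_6GHZ_CAP_SM_PS	0x0600	/* assumed field position */

enum smps_sketch { SMPS_STATIC, SMPS_DYNAMIC, SMPS_OFF };

static enum smps_sketch he_6ghz_smps(uint16_t capa)
{
	switch ((capa & HE_6GHZ_CAP_SM_PS) >> 9) {
	case 0:	/* static */
	case 2:	/* "invalid": treated as static, as in the patch */
		return SMPS_STATIC;
	case 1:	/* dynamic */
		return SMPS_DYNAMIC;
	default: /* disabled */
		return SMPS_OFF;
	}
}

int main(void)
{
	printf("%d\n", he_6ghz_smps(3 << 9));	/* SMPS_OFF */
	return 0;
}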
Link: https://lore.kernel.org/r/1589399105-25472-6-git-send-email-rmanohar@codeaurora.org Co-developed-by: Rajkumar Manoharan Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/20200528213443.784e4890d82f.I5f1230d5ab27e84e7bbe88e3645b24ea15a0c146@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- net/mac80211/cfg.c | 4 +++- net/mac80211/he.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/mesh_plink.c | 4 +++- net/mac80211/mlme.c | 1 + 6 files changed, 59 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7cb712427df1..11d5610d2ad5 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #ifndef MAC80211_H @@ -1977,6 +1977,7 @@ struct ieee80211_sta_txpwr { * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @he_cap: HE capabilities of this STA + * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2016,6 +2017,7 @@ struct ieee80211_sta { struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 06a2b7640a9d..90a07d075fdb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1520,7 +1520,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, - params->he_capa_len, sta); + params->he_capa_len, + (void *)params->he_6ghz_capa, + sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the diff --git a/net/mac80211/he.c b/net/mac80211/he.c index f520552b22be..cc26f239838b 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -8,10 +8,55 @@ #include "ieee80211_i.h" +static void +ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa, + struct sta_info *sta) +{ + enum ieee80211_smps_mode smps_mode; + + if (sta->sdata->vif.type == NL80211_IFTYPE_AP || + sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_SM_PS)) { + case WLAN_HT_CAP_SM_PS_INVALID: + case WLAN_HT_CAP_SM_PS_STATIC: + smps_mode = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_CAP_SM_PS_DYNAMIC: + smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_CAP_SM_PS_DISABLED: + smps_mode = IEEE80211_SMPS_OFF; + break; + } + + sta->sta.smps_mode = smps_mode; + } else { + sta->sta.smps_mode = IEEE80211_SMPS_OFF; + } + + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN)) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + default: + 
sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } + + sta->sta.he_6ghz_capa = *he_6ghz_capa; +} + void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; @@ -53,6 +98,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta); sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + + if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa) + ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta); } void diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6cac5bf7cba3..24dc1fd57000 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1899,6 +1899,7 @@ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index fbbfc5d4a51c..798e4b6b383f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -444,7 +444,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->vht_cap_elem, sta); ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, - elems->he_cap_len, sta); + elems->he_cap_len, + elems->he_6ghz_capa, + sta); if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c534cd1bb9cd..8a37089e86bb 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3430,6 +3430,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, + elems->he_6ghz_capa, sta); bss_conf->he_support = sta->sta.he_cap.has_he; -- cgit v1.2.3 From 2ad2274c58ee2dcaf9ccde5c63ff30f59b138f77 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 28 May 2020 21:34:39 +0200 Subject: mac80211: Add HE 6GHz capabilities element to probe request On 6 GHz, the 6 GHz capabilities element should be added, do that. 
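The element added here is an Element ID Extension, so the on-air encoding is: EID 255 (WLAN_EID_EXTENSION), a one-octet length of 3, the extension ID WLAN_EID_EXT_HE_6GHZ_CAPA, then the two capability octets in little-endian order. A small sketch of that layout, with a made-up capability value purely for illustration:

#include <linux/ieee80211.h>
#include <linux/string.h>

static void build_he_6ghz_capa_elem(u8 elem[5])
{
	__le16 capa = cpu_to_le16(0x06fa);	/* illustrative value only */

	elem[0] = WLAN_EID_EXTENSION;		/* 255 */
	elem[1] = 1 + sizeof(capa);		/* 3: ext EID + 2 capa octets */
	elem[2] = WLAN_EID_EXT_HE_6GHZ_CAPA;
	memcpy(&elem[3], &capa, sizeof(capa));	/* already little-endian */
}

This corresponds to the 5-byte bound checked by ieee80211_write_he_6ghz_cap() in the diff below.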
Signed-off-by: Ilan Peer [add commit message] Link: https://lore.kernel.org/r/20200528213443.8ee764f0cde0.I2b0c66b60e11818c97c9803e04a6a197c6376243@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 ++++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/scan.c | 17 +++++++++-------- net/mac80211/util.c | 36 ++++++++++++++++++++++++++++-------- 4 files changed, 58 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9b76be3d561a..95b55eea2afb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -512,6 +512,26 @@ ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) return ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_STATION); } +/** + * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities + * @sband: the sband to search for the STA on + * @iftype: the iftype to search for + * + * Return: the 6GHz capabilities + */ +static inline __le16 +ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (WARN_ON(!data || !data->he_cap.has_he)) + return 0; + + return data->he_6ghz_capa.capa; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 24dc1fd57000..ec1a71ac65f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2144,7 +2144,7 @@ enum { IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index d0c2e8012118..ad90bbe57457 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -313,8 +313,9 @@ ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef, } /* return false if no more work */ -static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) +static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; @@ -361,7 +362,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; - ielen = ieee80211_build_preq_ies(local, + ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, @@ -401,9 +402,12 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (WARN_ON(!local->scan_req)) return; + scan_sdata = rcu_dereference_protected(local->scan_sdata, + lockdep_is_held(&local->mtx)); + if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && - ieee80211_prep_hw_scan(local)) { + ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, @@ -432,9 +436,6 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) cfg80211_scan_done(scan_req, &local->scan_info); } RCU_INIT_POINTER(local->scan_req, NULL); - - scan_sdata = rcu_dereference_protected(local->scan_sdata, - lockdep_is_held(&local->mtx)); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; @@ -776,7 +777,7 @@ 
static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); if (hw_scan) { - WARN_ON(!ieee80211_prep_hw_scan(local)); + WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); @@ -1274,7 +1275,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index cbe24d303f0d..21c94094a699 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1663,7 +1663,20 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, } } -static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, +static u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end) +{ + if ((end - pos) < 5) + return pos; + + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + memcpy(pos, &cap, sizeof(cap)); + + return pos + 2; +} + +static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, enum nl80211_band band, @@ -1671,6 +1684,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, struct cfg80211_chan_def *chandef, size_t *offset, u32 flags) { + struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; @@ -1848,6 +1862,14 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; + + if (sband->band == NL80211_BAND_6GHZ) { + enum nl80211_iftype iftype = + ieee80211_vif_type_p2p(&sdata->vif); + __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); + + pos = ieee80211_write_he_6ghz_cap(pos, cap, end); + } } /* @@ -1862,7 +1884,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, return pos - buffer; } -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, @@ -1877,7 +1899,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { - pos += ieee80211_build_preq_ies_band(local, + pos += ieee80211_build_preq_ies_band(sdata, buffer + pos, buffer_len - pos, ie, ie_len, i, @@ -1939,7 +1961,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return NULL; rate_masks[chan->band] = ratemask; - ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), + ies_len = ieee80211_build_preq_ies(sdata, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); @@ -2879,10 +2901,8 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, } pos = skb_put(skb, 2 + 1 + sizeof(cap)); - *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(cap); - *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; - put_unaligned_le16(cap, pos); + ieee80211_write_he_6ghz_cap(pos, cpu_to_le16(cap), + pos + 2 + 1 + sizeof(cap)); } u8 
*ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, -- cgit v1.2.3 From ba8f6a037f790147438173029799f54c9d3065f2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:40 +0200 Subject: cfg80211: treat 6 GHz channels as valid regardless of capability If a 6 GHz channel exists, then we can probably safely assume that the device actually supports it, and then it should support most bandwidths. This will probably need to be extended to check the interface type and then dig into the HE capabilities for that though, to have the correct bandwidth check. Link: https://lore.kernel.org/r/20200528213443.d4864ef52e92.I82f09b2b14a56413ce20376d09967fe954a033eb@changeid Signed-off-by: Johannes Berg --- net/wireless/chan.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index e111c08daa0e..cddf92c5d09e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018 Intel Corporation + * Copyright 2018-2020 Intel Corporation */ #include @@ -919,7 +919,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, width = 10; break; case NL80211_CHAN_WIDTH_20: - if (!ht_cap->ht_supported) + if (!ht_cap->ht_supported && + chandef->chan->band != NL80211_BAND_6GHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_20_NOHT: @@ -928,6 +929,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_40: width = 40; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!ht_cap->ht_supported) return false; if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || @@ -942,24 +945,29 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_80P80: cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + if (chandef->chan->band != NL80211_BAND_6GHZ && + cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_80: - if (!vht_cap->vht_supported) - return false; prohibited_flags |= IEEE80211_CHAN_NO_80MHZ; width = 80; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; + if (!vht_cap->vht_supported) + return false; break; case NL80211_CHAN_WIDTH_160: + prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; + width = 160; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!vht_cap->vht_supported) return false; cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; - prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; - width = 160; break; default: WARN_ON_ONCE(1); -- cgit v1.2.3 From 461ce35d5535c1479384f67fcf4bfc3f3610edca Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:41 +0200 Subject: cfg80211: reject HT/VHT capabilities on 6 GHz band On the 6 GHz band, HE should be used, but without any direct HT/VHT capabilities, instead the HE 6 GHz band capabilities will capture the relevant information. Reject HT/VHT capabilities here. 
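Taken together with the next patch in this series (which requires HE on all supported interface types), the expected shape of a driver's 6 GHz band registration is: ht_cap and vht_cap left zeroed, HE capabilities advertised per interface type. A sketch under those assumptions, with invented capability values and the channel/bitrate setup elided:

#include <linux/bits.h>
#include <linux/kernel.h>
#include <net/cfg80211.h>

static const struct ieee80211_sband_iftype_data drv_6ghz_iftype_data[] = {
	{
		.types_mask = BIT(NL80211_IFTYPE_STATION),
		.he_cap = {
			.has_he = true,
			/* .he_cap_elem and .he_mcs_nss_supp elided */
		},
		.he_6ghz_capa = {
			.capa = cpu_to_le16(0x3038),	/* invented value */
		},
	},
};

static struct ieee80211_supported_band drv_band_6ghz = {
	.band = NL80211_BAND_6GHZ,
	/* no .ht_cap/.vht_cap: per the diff below, wiphy_register()
	 * warns and fails if they are set on this band */
	.iftype_data = drv_6ghz_iftype_data,
	.n_iftype_data = ARRAY_SIZE(drv_6ghz_iftype_data),
};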
Link: https://lore.kernel.org/r/20200528213443.bfe89c35459a.Ibba5e066fa0087fd49d13cfee89d196ea0c68ae2@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index b795f363d004..1651f86db6ca 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -807,6 +807,11 @@ int wiphy_register(struct wiphy *wiphy) !sband->n_bitrates)) return -EINVAL; + if (WARN_ON(band == NL80211_BAND_6GHZ && + (sband->ht_cap.ht_supported || + sband->vht_cap.vht_supported))) + return -EINVAL; + /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a -- cgit v1.2.3 From f438136528482f98535889c9a6f99bbacdd92870 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:42 +0200 Subject: cfg80211: require HE capabilities for 6 GHz band On 6 GHz band, HE capabilities must be available for all of the interface types, otherwise we shouldn't use 6 GHz. Check this. Link: https://lore.kernel.org/r/20200528213443.5881cb3c8c4a.I583b54172f91f98d44af64a16c5826fe458cbb27@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 1651f86db6ca..5b6714460490 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -791,6 +791,7 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { u16 types = 0; + bool have_he = false; sband = wiphy->bands[band]; if (!sband) @@ -859,8 +860,17 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; types |= iftd->types_mask; + + if (i == 0) + have_he = iftd->he_cap.has_he; + else + have_he = have_he && + iftd->he_cap.has_he; } + if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) + return -EINVAL; + have_band = true; } -- cgit v1.2.3 From 93382a0d119b3ab95e3ebca51ea15aa87187b493 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:44 +0200 Subject: mac80211: accept aggregation sessions on 6 GHz On 6 GHz, stations don't have ht_supported set, but they can still do aggregation since they must have HE, allow that. 
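The two hunks below add the same test on the RX and TX paths; read as one predicate, it amounts to the following hypothetical helper (not part of the patch):

#include <net/mac80211.h>

/* mirrors the checks added in agg-rx.c and agg-tx.c */
static bool sta_band_allows_ampdu(struct ieee80211_sta *pubsta,
				  enum nl80211_band band)
{
	/* HT signals A-MPDU support on 2.4/5 GHz; 6 GHz has no HT at all,
	 * but HE is mandatory there and provides aggregation, so allow it
	 */
	return pubsta->ht_cap.ht_supported || band == NL80211_BAND_6GHZ;
}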
Link: https://lore.kernel.org/r/20200528213443.776d3c891b64.Ifa099d450617b50c691832b3c4aa08959fab520a@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 5 +++-- net/mac80211/agg-tx.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 4d1c335e06e5..7f245e9f114c 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ /** @@ -292,7 +292,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (!sta->sta.ht_cap.ht_supported) { + if (!sta->sta.ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c2d5f512526d..b37c8a983d88 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -593,7 +593,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - if (!pubsta->ht_cap.ht_supported) + if (!pubsta->ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) -- cgit v1.2.3 From 6fcb56ce0f9036b08d1c3e35ff93b100d499771b Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 28 May 2020 21:34:45 +0200 Subject: mac80211: Consider 6 GHz band when handling power constraint Treat it like the 5 GHz band. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20200528213443.889e5c9dd006.Id8ed3bb8000ba8738be5df05639415eb2e23c61a@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8a37089e86bb..d7bffd640ed3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1533,6 +1533,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, chan_increment = 1; break; case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: chan_increment = 4; break; } -- cgit v1.2.3 From 07c12d618f06c8876fe12a53217d92b32d7baf07 Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Thu, 28 May 2020 21:34:46 +0200 Subject: mac80211: set short_slot for 6 GHz band Set short slot also for 6 GHz band, just like 5 GHz. 
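Short slot time (9 us instead of the legacy 20 us) is an ERP-era distinction that only matters on 2.4 GHz; on OFDM-only bands the short slot always applies. A condensed view of the rule the two hunks below implement (hypothetical helper, not from the patch):

#include <linux/ieee80211.h>
#include <net/cfg80211.h>

static bool band_uses_short_slot(enum nl80211_band band, u16 capab)
{
	/* 5 and 6 GHz are OFDM-only, the long slot never applies there */
	if (band == NL80211_BAND_5GHZ || band == NL80211_BAND_6GHZ)
		return true;

	/* on 2.4 GHz, follow the capability bit advertised by the AP */
	return !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME);
}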
Signed-off-by: Tova Mussai Link: https://lore.kernel.org/r/20200528213443.75f38e6f5efd.I272fbae402b03123f04e9ae69204eeab960c70cd@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 ++- net/mac80211/mlme.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 90a07d075fdb..9b360544ad6f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2198,7 +2198,8 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - sband->band == NL80211_BAND_5GHZ) { + (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ)) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d7bffd640ed3..5820ef02a587 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2171,7 +2171,8 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (sband->band == NL80211_BAND_5GHZ) + if (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { -- cgit v1.2.3 From 093a48d2aa4b74db3134b61d7b7a061dbe79177b Mon Sep 17 00:00:00 2001 From: Nathan Errera Date: Thu, 28 May 2020 21:22:38 +0200 Subject: cfg80211: support bigger kek/kck key length With some newer AKMs, the KCK and KEK are bigger, so allow that if the driver advertises support for it. In addition, add a new attribute for the AKM so we can use it for offloaded rekeying. Signed-off-by: Nathan Errera [reword commit message] Link: https://lore.kernel.org/r/20200528212237.5eb58b00a5d1.I61b09d77c4f382e8d58a05dcca78096e99a6bc15@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++++--- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 23 +++++++++++++++++++---- 3 files changed, 32 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95b55eea2afb..b58ad1a3f695 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2936,12 +2936,17 @@ struct cfg80211_wowlan_wakeup { /** * struct cfg80211_gtk_rekey_data - rekey data - * @kek: key encryption key (NL80211_KEK_LEN bytes) - * @kck: key confirmation key (NL80211_KCK_LEN bytes) + * @kek: key encryption key (@kek_len bytes) + * @kck: key confirmation key (@kck_len bytes) * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) + * @kek_len: length of kek + * @kck_len length of kck + * @akm: akm (oui, id) */ struct cfg80211_gtk_rekey_data { const u8 *kek, *kck, *replay_ctr; + u32 akm; + u8 kek_len, kck_len; }; /** @@ -4166,9 +4171,10 @@ struct cfg80211_ops { * beaconing mode (AP, IBSS, Mesh, ...). * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation * before connection. 
+ * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5b350d032fa3..dad8c8f8581f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5396,6 +5396,8 @@ enum plink_actions { #define NL80211_KCK_LEN 16 #define NL80211_KEK_LEN 16 +#define NL80211_KCK_EXT_LEN 24 +#define NL80211_KEK_EXT_LEN 32 #define NL80211_REPLAY_CTR_LEN 8 /** @@ -5404,6 +5406,7 @@ enum plink_actions { * @NL80211_REKEY_DATA_KEK: key encryption key (binary) * @NL80211_REKEY_DATA_KCK: key confirmation key (binary) * @NL80211_REKEY_DATA_REPLAY_CTR: replay counter (binary) + * @NL80211_REKEY_DATA_AKM: AKM data (OUI, suite type) * @NUM_NL80211_REKEY_DATA: number of rekey attributes (internal) * @MAX_NL80211_REKEY_DATA: highest rekey attribute (internal) */ @@ -5412,6 +5415,7 @@ enum nl80211_rekey_data { NL80211_REKEY_DATA_KEK, NL80211_REKEY_DATA_KCK, NL80211_REKEY_DATA_REPLAY_CTR, + NL80211_REKEY_DATA_AKM, /* keep last */ NUM_NL80211_REKEY_DATA, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3a24e6add13e..263ae395ad44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -730,9 +730,16 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), - [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_KEK] = { + .type = NLA_BINARY, + .len = NL80211_KEK_EXT_LEN + }, + [NL80211_REKEY_DATA_KCK] = { + .type = NLA_BINARY, + .len = NL80211_KCK_EXT_LEN + }, [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), + [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, }; static const struct nla_policy @@ -12347,14 +12354,22 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN)) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); + rekey_data.kek_len = nla_len(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck_len = nla_len(tb[NL80211_REKEY_DATA_KCK]); + if (tb[NL80211_REKEY_DATA_AKM]) rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); if (!wdev->current_bss) { -- cgit v1.2.3 From dafe2078a75af1abe4780313ef8dd8491ba8598f Mon Sep 17 00:00:00 2001 From: Patrick Eigensatz Date: Mon, 1 Jun 2020 13:12:01 +0200 Subject: ipv4: nexthop: Fix deadcode issue by performing a proper NULL check After allocating the spare nexthop group it should be tested for kzalloc() returning
NULL, instead the already used nexthop group (which cannot be NULL at this point) had been tested so far. Additionally, if kzalloc() fails, return ERR_PTR(-ENOMEM) instead of NULL. Coverity-id: 1463885 Reported-by: Coverity Signed-off-by: Patrick Eigensatz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index ebafa5ed91ac..400a9f89ebdb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1185,10 +1185,10 @@ static struct nexthop *nexthop_create_group(struct net *net, /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); - if (!nhg) { + if (!nhg->spare) { kfree(nhg); kfree(nh); - return NULL; + return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; -- cgit v1.2.3 From 53fc685243bd6fb90d90305cea54598b78d3cbfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Jun 2020 15:58:54 +0300 Subject: bridge: Avoid infinite loop when suppressing NS messages with invalid options When neighbor suppression is enabled the bridge device might reply to Neighbor Solicitation (NS) messages on behalf of remote hosts. In case the NS message includes the "Source link-layer address" option [1], the bridge device will use the specified address as the link-layer destination address in its reply. To avoid an infinite loop, break out of the options parsing loop when encountering an option with length zero and disregard the NS message. This is consistent with the IPv6 ndisc code and RFC 4886 which states that "Nodes MUST silently discard an ND packet that contains an option with length zero" [2]. [1] https://tools.ietf.org/html/rfc4861#section-4.3 [2] https://tools.ietf.org/html/rfc4861#section-4.6 Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Signed-off-by: Ido Schimmel Reported-by: Alla Segal Tested-by: Alla Segal Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_arp_nd_proxy.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 37908561a64b..b18cdf03edb3 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -276,6 +276,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, ns_olen = request->len - (skb_network_offset(request) + sizeof(struct ipv6hdr)) - sizeof(*ns); for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (!ns->opt[i + 1]) { + kfree_skb(reply); + return; + } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; -- cgit v1.2.3 From a01c245438c59a44f3ece8440046c78675fa8b0b Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 29 May 2020 00:05:32 +0200 Subject: net/sched: fix a couple of splats in the error path of tfc_gate_init() trying to configure TC 'act_gate' rules with invalid control actions, the following splat can be observed: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 1 PID: 2143 Comm: tc Not tainted 5.7.0-rc6+ #168 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:hrtimer_active+0x56/0x290 [...] 
Call Trace: hrtimer_try_to_cancel+0x6d/0x330 hrtimer_cancel+0x11/0x20 tcf_gate_cleanup+0x15/0x30 [act_gate] tcf_action_cleanup+0x58/0x170 __tcf_action_put+0xb0/0xe0 __tcf_idr_release+0x68/0x90 tcf_gate_init+0x7c7/0x19a0 [act_gate] tcf_action_init_1+0x60f/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 this is caused by hrtimer_cancel(), running before hrtimer_init(). Fix it ensuring to call hrtimer_cancel() only if clockid is valid, and the timer has been initialized. After fixing this splat, the same error path causes another problem: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 980 Comm: tc Not tainted 5.7.0-rc6+ #168 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:release_entry_list+0x4a/0x240 [act_gate] [...] Call Trace: tcf_action_cleanup+0x58/0x170 __tcf_action_put+0xb0/0xe0 __tcf_idr_release+0x68/0x90 tcf_gate_init+0x7ab/0x19a0 [act_gate] tcf_action_init_1+0x60f/0x960 tcf_action_init+0x157/0x2a0 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x2a3/0x39d rtnetlink_rcv_msg+0x5f3/0x920 netlink_rcv_skb+0x121/0x350 netlink_unicast+0x439/0x630 netlink_sendmsg+0x714/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5b4/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x9a/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 the problem is similar: tcf_action_cleanup() was trying to release a list without initializing it first. Ensure that INIT_LIST_HEAD() is called for every newly created 'act_gate' action, same as what was done to 'act_ife' with commit 44c23d71599f ("net/sched: act_ife: initalize ife->metalist earlier"). Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") CC: Ivan Vecera Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_gate.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 35fc48795541..9c628591f452 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -331,6 +331,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + if (ret == ACT_P_CREATED) { + to_gate(*a)->param.tcfg_clockid = -1; + INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); + } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -377,7 +381,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, goto chain_put; } - INIT_LIST_HEAD(&p->entries); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) @@ -449,9 +452,9 @@ static void tcf_gate_cleanup(struct tc_action *a) struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; - hrtimer_cancel(&gact->hitimer); - p = &gact->param; + if (p->tcfg_clockid != -1) + hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } -- cgit v1.2.3 From a8284c6899cf7321abbd258d970a9442978b0a4f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:34 +0200 Subject: netfilter: nf_flowtable: expose nf_flow_table_gc_cleanup() This function schedules the flow teardown state and it forces a gc run. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index c54a7f707e50..d7338bfd7b0f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -175,6 +175,8 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev); void nf_flow_table_cleanup(struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 42da6e337276..6a3034f84ab6 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -588,8 +588,8 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) flow_offload_teardown(flow); } -static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - struct net_device *dev) +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); @@ -602,7 +602,7 @@ void nf_flow_table_cleanup(struct net_device *dev) mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) - nf_flow_table_iterate_cleanup(flowtable, dev); + nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); -- cgit v1.2.3 From 1fac52da5942c58dd3e337fd7c5a550925ca752e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:35 +0200 Subject: net: flow_offload: consolidate indirect flow_block infrastructure Tunnel devices provide no dev->netdev_ops->ndo_setup_tc(...) interface. 
The tunnel device and route control plane does not provide an obvious way to relate tunnel and physical devices. This patch allows drivers to register a tunnel device offload handler for the tc and netfilter frontends through flow_indr_dev_register() and flow_indr_dev_unregister(). The frontend calls flow_indr_dev_setup_offload() that iterates over the list of drivers that are offering tunnel device hardware offload support and it sets up the flow block for this tunnel device. If the driver module is removed, the indirect flow_block ends up with a stale callback reference. The module removal path triggers the dev_shutdown() path to remove the qdisc and the flow_blocks for the physical devices. However, this is not useful for tunnel devices, where relation between the physical and the tunnel device is not explicit. This patch introduces a cleanup callback that is invoked when the driver module is removed to clean up the tunnel device flow_block. This patch defines struct flow_block_indr and it uses it from flow_block_cb to store the information that front-end requires to perform the flow_block_cb cleanup on module removal. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 19 ++++++ net/core/flow_offload.c | 157 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 95d633785ef9..5493282348fa 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -443,6 +443,16 @@ enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); +struct flow_block_cb; + +struct flow_block_indr { + struct list_head list; + struct net_device *dev; + enum flow_block_binder_type binder_type; + void *data; + void (*cleanup)(struct flow_block_cb *block_cb); +}; + struct flow_block_cb { struct list_head driver_list; struct list_head list; @@ -450,6 +460,7 @@ struct flow_block_cb { void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); + struct flow_block_indr indr; unsigned int refcnt; }; @@ -523,6 +534,14 @@ static inline void flow_block_init(struct flow_block *flow_block) typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, enum tc_setup_type type, void *type_data); +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb); +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)); + typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e64941c526b1..8cd7da2586ae 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -317,6 +317,163 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); + +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; + struct rcu_head rcu; +}; + +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + 
indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); + + return indr_dev; +} + +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } + + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } + + list_add(&indr_dev->list, &flow_block_indr_dev_list); + mutex_unlock(&flow_indr_block_lock); + + return 0; +} +EXPORT_SYMBOL(flow_indr_dev_register); + +static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, + struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->cb == setup_cb && + this->cb_priv == cb_priv) { + list_move(&this->indr.list, cleanup_list); + return; + } + } +} + +static void flow_block_indr_notify(struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } +} + +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb) +{ + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } + + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return; + } + + __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); + + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); +} +EXPORT_SYMBOL(flow_indr_dev_unregister); + +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.dev = dev; + flow_block->indr.cleanup = cleanup; +} + +static void __flow_block_indr_binding(struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &bo->cb_list, list) { + switch (bo->command) { + case FLOW_BLOCK_BIND: + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + break; + case FLOW_BLOCK_UNBIND: + list_del(&block_cb->indr.list); + break; + } + } +} + +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_indr_dev *this; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(this, &flow_block_indr_dev_list, list) + this->cb(dev, this->cb_priv, type, bo); + + __flow_block_indr_binding(bo, dev, data, cleanup); + mutex_unlock(&flow_indr_block_lock); + + return list_empty(&bo->cb_list) 
? -EOPNOTSUPP : 0; +} +EXPORT_SYMBOL(flow_indr_dev_setup_offload); + static LIST_HEAD(block_cb_list); static struct rhashtable indr_setup_block_ht; -- cgit v1.2.3 From 324a823b9962a0f290c40fb6314926d434193276 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:36 +0200 Subject: net: cls_api: add tcf_block_offload_init() Add a helper function to initialize the flow_block_offload structure. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/sched/cls_api.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 752d608f4442..c5a2f16097b6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -693,6 +693,22 @@ static void tc_indr_block_get_and_cmd(struct net_device *dev, tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); } +static void tcf_block_offload_init(struct flow_block_offload *bo, + struct net_device *dev, + enum flow_block_command command, + enum flow_block_binder_type binder_type, + struct flow_block *flow_block, + bool shared, struct netlink_ext_ack *extack) +{ + bo->net = dev_net(dev); + bo->command = command; + bo->binder_type = binder_type; + bo->block = flow_block; + bo->block_shared = shared; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, @@ -727,13 +743,9 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct flow_block_offload bo = {}; int err; - bo.net = dev_net(dev); - bo.command = command; - bo.binder_type = ei->binder_type; - bo.block = &block->flow_block; - bo.block_shared = tcf_block_shared(block); - bo.extack = extack; - INIT_LIST_HEAD(&bo.cb_list); + tcf_block_offload_init(&bo, dev, command, ei->binder_type, + &block->flow_block, tcf_block_shared(block), + extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) { -- cgit v1.2.3 From 0fdcf78d59737939ea449b512d02c3733a22c8e1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:37 +0200 Subject: net: use flow_indr_dev_setup_offload() Update existing frontends to use flow_indr_dev_setup_offload(). This new function must be called if ->ndo_setup_tc is unset to deal with tunnel devices. If there is no driver that is subscribed to new tunnel device flow_block bindings, then this function bails out with EOPNOTSUPP. If the driver module is removed, the ->cleanup() callback removes the entries that belong to this tunnel device. This cleanup procedures is triggered when the device unregisters the tunnel device offload handler. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/netfilter/nf_flow_table_offload.c | 19 +++++++++--- net/netfilter/nf_tables_offload.c | 28 ++++++++++++++--- net/sched/cls_api.c | 58 +++++++++++++++++------------------ 3 files changed, 67 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 2ff4087007a6..01cfa02c43bd 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -942,6 +942,18 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } +static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) +{ + struct nf_flowtable *flowtable = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + + nf_flow_table_gc_cleanup(flowtable, dev); + down_write(&flowtable->flow_block_lock); + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + up_write(&flowtable->flow_block_lock); +} + static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, struct nf_flowtable *flowtable, struct net_device *dev, @@ -950,12 +962,9 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, { nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT); - if (list_empty(&bo->cb_list)) - return -EOPNOTSUPP; - - return 0; + return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + nf_flow_table_indr_cleanup); } static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 954bccb7f32a..1960f11477e8 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -304,21 +304,41 @@ static void nft_indr_block_ing_cmd(struct net_device *dev, nft_block_setup(chain, &bo, cmd); } -static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, +static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) +{ + struct nft_base_chain *basechain = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct net *net = dev_net(dev); + struct flow_block_offload bo; + + nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, + basechain, &extack); + mutex_lock(&net->nft.commit_mutex); + list_move(&block_cb->list, &bo.cb_list); + nft_flow_offload_unbind(&bo, basechain); + mutex_unlock(&net->nft.commit_mutex); +} + +static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; + int err; - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK); + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + nft_indr_block_cleanup); + if (err < 0) + return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; - return nft_block_setup(chain, &bo, cmd); + return nft_block_setup(basechain, &bo, cmd); } #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c5a2f16097b6..760e51d852f5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -709,24 +709,26 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } -static void tc_indr_block_call(struct tcf_block *block, - struct 
net_device *dev, - struct tcf_block_ext_info *ei, - enum flow_block_command command, - struct netlink_ext_ack *extack) +static void tcf_block_unbind(struct tcf_block *block, + struct flow_block_offload *bo); + +static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { - struct flow_block_offload bo = { - .command = command, - .binder_type = ei->binder_type, - .net = dev_net(dev), - .block = &block->flow_block, - .block_shared = tcf_block_shared(block), - .extack = extack, - }; - INIT_LIST_HEAD(&bo.cb_list); + struct tcf_block *block = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); - tcf_block_setup(block, &bo); + tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + block_cb->indr.binder_type, + &block->flow_block, tcf_block_shared(block), + &extack); + down_write(&block->cb_lock); + list_move(&block_cb->list, &bo.cb_list); + up_write(&block->cb_lock); + rtnl_lock(); + tcf_block_unbind(block, &bo); + rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) @@ -747,7 +749,12 @@ static int tcf_block_offload_cmd(struct tcf_block *block, &block->flow_block, tcf_block_shared(block), extack); - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (dev->netdev_ops->ndo_setup_tc) + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + else + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, + &bo, tc_block_indr_cleanup); + if (err < 0) { if (err != -EOPNOTSUPP) NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); @@ -765,13 +772,13 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_inc; /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + if (dev->netdev_ops->ndo_setup_tc && + !tc_can_offload(dev) && + tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; @@ -783,18 +790,15 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, if (err) goto err_unlock; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); up_write(&block->cb_lock); return 0; no_offload_dev_inc: - if (tcf_block_offload_in_use(block)) { - err = -EOPNOTSUPP; + if (tcf_block_offload_in_use(block)) goto err_unlock; - } + err = 0; block->nooffloaddevcnt++; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); err_unlock: up_write(&block->cb_lock); return err; @@ -807,10 +811,6 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); - - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; -- cgit v1.2.3 From 709ffbe19b777e8fc952e2fdcfd8e6f50c8ef08c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:41 +0200 Subject: net: remove indirect block netdev event registration Drivers do not register to netdev events to set up indirect blocks anymore. Remove __flow_indr_block_cb_register() and __flow_indr_block_cb_unregister(). 
The frontends set up the callbacks through flow_indr_dev_setup_block() Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 9 -- net/core/flow_offload.c | 238 ---------------------------------- net/netfilter/nf_flow_table_offload.c | 66 ---------- net/netfilter/nf_tables_offload.c | 53 +------- net/sched/cls_api.c | 79 ----------- 5 files changed, 1 insertion(+), 444 deletions(-) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 5493282348fa..69e13c8b6b3a 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -546,15 +546,6 @@ typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); -struct flow_indr_block_entry { - flow_indr_block_cmd_t *cb; - struct list_head list; -}; - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry); - int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, void *cb_ident); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 8cd7da2586ae..0cfc35e6be28 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -473,241 +473,3 @@ int flow_indr_dev_setup_offload(struct net_device *dev, return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); - -static LIST_HEAD(block_cb_list); - -static struct rhashtable indr_setup_block_ht; - -struct flow_indr_block_cb { - struct list_head list; - void *cb_priv; - flow_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -struct flow_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params flow_indr_setup_block_ht_params = { - .key_offset = offsetof(struct flow_indr_block_dev, dev), - .head_offset = offsetof(struct flow_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), -}; - -static struct flow_indr_block_dev * -flow_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - flow_indr_setup_block_ht_params); -} - -static struct flow_indr_block_dev * -flow_indr_block_dev_get(struct net_device *dev) -{ - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); - if (!indr_dev) - return NULL; - - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } - -inc_ref: - indr_dev->refcnt++; - return indr_dev; -} - -static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void 
*cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); - - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); - - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); - - return indr_block_cb; -} - -static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) -{ - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - -static DEFINE_MUTEX(flow_indr_block_cb_lock); - -static void flow_block_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command) -{ - struct flow_indr_block_entry *entry; - - mutex_lock(&flow_indr_block_cb_lock); - list_for_each_entry(entry, &block_cb_list, list) { - entry->cb(dev, cb, cb_priv, command); - } - mutex_unlock(&flow_indr_block_cb_lock); -} - -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - int err; - - indr_dev = flow_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; - - indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); - - return 0; - -err_dev_put: - flow_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - int err; - - rtnl_lock(); - err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); - - return err; -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); - -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) - return; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); - - flow_indr_block_cb_del(indr_block_cb); - flow_indr_block_dev_put(indr_dev); -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); - -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - rtnl_lock(); - __flow_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); - -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); -} -EXPORT_SYMBOL_GPL(flow_indr_block_call); - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_add_tail(&entry->list, 
&block_cb_list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_del(&entry->list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); - -static int __init init_flow_indr_rhashtable(void) -{ - return rhashtable_init(&indr_setup_block_ht, - &flow_indr_setup_block_ht_params); -} -subsys_initcall(init_flow_indr_rhashtable); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 01cfa02c43bd..62651e6683f6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1008,69 +1008,6 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); -static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, - struct nf_flowtable *flowtable, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!flowtable) - return; - - nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, - &extack); - - cb(dev, cb_priv, TC_SETUP_FT, &bo); - - nf_flow_table_block_setup(flowtable, &bo, cmd); -} - -static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, - struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) - return; - - nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); -} - -static void nf_flow_table_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_flowtable *nft_ft; - struct nft_table *table; - struct nft_hook *hook; - - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(nft_ft, &table->flowtables, list) { - list_for_each_entry(hook, &nft_ft->hook_list, list) { - if (hook->ops.dev != dev) - continue; - - nf_flow_table_indr_block_cb_cmd(&nft_ft->data, - dev, cb, - cb_priv, cmd); - } - } - } - mutex_unlock(&net->nft.commit_mutex); -} - -static struct flow_indr_block_entry block_ing_entry = { - .cb = nf_flow_table_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - int nf_flow_table_offload_init(void) { nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", @@ -1078,13 +1015,10 @@ int nf_flow_table_offload_init(void) if (!nf_flow_offload_wq) return -ENOMEM; - flow_indr_add_block_cb(&block_ing_entry); - return 0; } void nf_flow_table_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); destroy_workqueue(nf_flow_offload_wq); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 1960f11477e8..185fc82c99aa 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -285,25 +285,6 @@ static int nft_block_offload_cmd(struct nft_base_chain *chain, return nft_block_setup(chain, &bo, cmd); } -static void nft_indr_block_ing_cmd(struct net_device *dev, - struct nft_base_chain *chain, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!chain) - return; - - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - - cb(dev, cb_priv, 
TC_SETUP_BLOCK, &bo); - - nft_block_setup(chain, &bo, cmd); -} - static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; @@ -575,24 +556,6 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) return NULL; } -static void nft_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_chain *chain; - - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); - if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { - struct nft_base_chain *basechain; - - basechain = nft_base_chain(chain); - nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); - } - mutex_unlock(&net->nft.commit_mutex); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -614,30 +577,16 @@ static int nft_offload_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct flow_indr_block_entry block_ing_entry = { - .cb = nft_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { - int err; - - err = register_netdevice_notifier(&nft_offload_netdev_notifier); - if (err < 0) - return err; - - flow_indr_add_block_cb(&block_ing_entry); - - return 0; + return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); unregister_netdevice_notifier(&nft_offload_netdev_notifier); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 760e51d852f5..a00a203b2ef5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,78 +621,6 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command, bool ingress) -{ - struct flow_block_offload bo = { - .command = command, - .binder_type = ingress ? - FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : - FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, - .net = dev_net(dev), - .block_shared = tcf_block_non_null_shared(block), - }; - INIT_LIST_HEAD(&bo.cb_list); - - if (!block) - return; - - bo.block = &block->flow_block; - - down_write(&block->cb_lock); - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - tcf_block_setup(block, &bo); - up_write(&block->cb_lock); -} - -static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) -{ - const struct Qdisc_class_ops *cops; - const struct Qdisc_ops *ops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - ops = qdisc->ops; - if (!ops) - return NULL; - - if (!ingress && !strcmp("ingress", ops->id)) - return NULL; - - cops = ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, - ingress ? 
TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, - NULL); -} - -static void tc_indr_block_get_and_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) -{ - struct tcf_block *block; - - block = tc_dev_block(dev, true); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - - block = tc_dev_block(dev, false); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); -} - static void tcf_block_offload_init(struct flow_block_offload *bo, struct net_device *dev, enum flow_block_command command, @@ -3836,11 +3764,6 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_entry block_entry = { - .cb = tc_indr_block_get_and_cmd, - .list = LIST_HEAD_INIT(block_entry.list), -}; - static int __init tc_filter_init(void) { int err; @@ -3853,8 +3776,6 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_cb(&block_entry); - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, -- cgit v1.2.3 From 678eb199cc9df3bf1cb12fb2da22768b8d1b6bf3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:36 +0300 Subject: devlink: Create dedicated trap group for layer 3 exceptions Packets that hit exceptions during layer 3 forwarding must be trapped to the CPU for the control plane to function properly. Create a dedicated group for them, so that user space could choose to assign a different policer for them. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 7 +++++-- include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index fe089acb7783..4ca241e70064 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -277,8 +277,11 @@ narrow. 
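For driver authors, a minimal sketch of the registration side of such a group (a hypothetical "foo" driver is assumed, as is the trap-group registration helper declared in include/net/devlink.h at this point in the series; only the struct devlink_trap_group members shown in this patch are used):

#include <net/devlink.h>

/* Hypothetical driver code, not part of this patch: register the new
 * generic group so that user space can later bind a dedicated policer
 * to "l3_exceptions" without affecting "l3_drops". */
static const struct devlink_trap_group foo_trap_groups[] = {
	{
		.name = DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_EXCEPTIONS,
		.id = DEVLINK_TRAP_GROUP_GENERIC_ID_L3_EXCEPTIONS,
		.generic = true,
	},
};

static int foo_trap_groups_init(struct devlink *devlink)
{
	return devlink_trap_groups_register(devlink, foo_trap_groups,
					    ARRAY_SIZE(foo_trap_groups));
}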
The description of these groups must be added to the following table: - Contains packet traps for packets that were dropped by the device during layer 2 forwarding (i.e., bridge) * - ``l3_drops`` - - Contains packet traps for packets that were dropped by the device or hit - an exception (e.g., TTL error) during layer 3 forwarding + - Contains packet traps for packets that were dropped by the device during + layer 3 forwarding + * - ``l3_exceptions`` + - Contains packet traps for packets that hit an exception (e.g., TTL + error) during layer 3 forwarding * - ``buffer_drops`` - Contains packet traps for packets that were dropped by the device due to an enqueue decision diff --git a/include/net/devlink.h b/include/net/devlink.h index 8ffc1b5cd89b..851388c9d795 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,7 @@ enum devlink_trap_generic_id { enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_L3_EXCEPTIONS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, @@ -730,6 +731,8 @@ enum devlink_trap_group_generic_id { "l2_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_DROPS \ "l3_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_EXCEPTIONS \ + "l3_exceptions" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7b76e5fffc10..d9fff7083f02 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8505,6 +8505,7 @@ static const struct devlink_trap devlink_trap_generic[] = { static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), -- cgit v1.2.3 From 9eefeabed6f831018c15bd7e17d34967ee34d9dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:39 +0300 Subject: devlink: Add 'mirror' trap action The action is used by control traps such as IGMP query. The packet is flooded by the device, but also trapped to the CPU in order for the software bridge to mark the receiving port as a multicast router port. Such packets are marked with 'skb->offload_fwd_mark = 1' in order to prevent the software bridge from flooding them again. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 2 ++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 4ca241e70064..5b97327caefc 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -108,6 +108,8 @@ The ``devlink-trap`` mechanism supports the following packet trap actions: * ``trap``: The sole copy of the packet is sent to the CPU. * ``drop``: The packet is dropped by the underlying device and a copy is not sent to the CPU. + * ``mirror``: The packet is forwarded by the underlying device and a copy is + sent to the CPU. 
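As a hedged illustration, this is roughly how a driver might map the three actions onto a hardware forwarding decision (the foo_* names are made up for the example; only enum devlink_trap_action comes from the kernel):

#include <linux/errno.h>
#include <uapi/linux/devlink.h>

/* Hypothetical hardware forwarding decisions for a trapped flow. */
enum foo_hw_action {
	FOO_HW_ACTION_DISCARD,	/* drop: no copy reaches the CPU */
	FOO_HW_ACTION_TRAP,	/* trap: the sole copy goes to the CPU */
	FOO_HW_ACTION_MIRROR,	/* mirror: forward in hardware + CPU copy */
};

static int foo_trap_action_to_hw(enum devlink_trap_action action,
				 enum foo_hw_action *hw_action)
{
	switch (action) {
	case DEVLINK_TRAP_ACTION_DROP:
		*hw_action = FOO_HW_ACTION_DISCARD;
		return 0;
	case DEVLINK_TRAP_ACTION_TRAP:
		*hw_action = FOO_HW_ACTION_TRAP;
		return 0;
	case DEVLINK_TRAP_ACTION_MIRROR:
		*hw_action = FOO_HW_ACTION_MIRROR;
		return 0;
	}

	return -EINVAL;
}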
Generic Packet Traps ==================== diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1ae90e06c06d..16305932a950 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -233,10 +233,13 @@ enum { * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not * sent to the CPU. * @DEVLINK_TRAP_ACTION_TRAP: The sole copy of the packet is sent to the CPU. + * @DEVLINK_TRAP_ACTION_MIRROR: Packet is forwarded by the device and a copy is + * sent to the CPU. */ enum devlink_trap_action { DEVLINK_TRAP_ACTION_DROP, DEVLINK_TRAP_ACTION_TRAP, + DEVLINK_TRAP_ACTION_MIRROR, }; /** diff --git a/net/core/devlink.c b/net/core/devlink.c index d9fff7083f02..d6298917b077 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5869,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: -- cgit v1.2.3 From 30a4e9a29ab9aadfe6c5386ae4aa396b1d2556c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:40 +0300 Subject: devlink: Add 'control' trap type This type is used for traps that trap control packets such as ARP request and IGMP query to the CPU. Do not report such packets to the kernel's drop monitor, as they were neither dropped by the device nor did they encounter an exception during forwarding. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 8 +++++++- include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 5b97327caefc..6c293cfa23ee 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -55,7 +55,7 @@ The following diagram provides a general overview of ``devlink-trap``:: | | +-------^--------+ | - | + | Non-control traps | +----+----+ | | Kernel's Rx path @@ -97,6 +97,12 @@ The ``devlink-trap`` mechanism supports the following packet trap types: processed by ``devlink`` and injected to the kernel's Rx path. Changing the action of such traps is not allowed, as it can easily break the control plane. + * ``control``: Packets were trapped by the device because they are control + packets required for the correct functioning of the control plane. + For example, ARP request and IGMP query packets. Packets are injected to + the kernel's Rx path, but not reported to the kernel's drop monitor. + Changing the action of such traps is not allowed, as it can easily break + the control plane. .. _Trap-Actions: diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 16305932a950..08563e6a424d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -253,10 +253,16 @@ enum devlink_trap_action { * control plane for resolution. Trapped packets * are processed by devlink and injected to * the kernel's Rx path. + * @DEVLINK_TRAP_TYPE_CONTROL: Packet was trapped because it is required for + * the correct functioning of the control plane. + * For example, an ARP request packet.
Trapped + * packets are injected to the kernel's Rx path, + * but not reported to drop monitor. */ enum devlink_trap_type { DEVLINK_TRAP_TYPE_DROP, DEVLINK_TRAP_TYPE_EXCEPTION, + DEVLINK_TRAP_TYPE_CONTROL, }; enum { diff --git a/net/core/devlink.c b/net/core/devlink.c index d6298917b077..47c28e0f848f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8847,6 +8847,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); -- cgit v1.2.3 From 515eac677fe119433c2a466443bef95c10c550cc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:41 +0300 Subject: devlink: Add layer 2 control packet traps Add layer 2 control packet traps such as STP and IGMP query, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 45 +++++++++++++++++++++ include/net/devlink.h | 48 +++++++++++++++++++++++ net/core/devlink.c | 16 ++++++++ 3 files changed, 109 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 6c293cfa23ee..e9fc3c9d7d7a 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -252,6 +252,42 @@ be added to the following table: * - ``egress_flow_action_drop`` - ``drop`` - Traps packets dropped during processing of egress flow action drop + * - ``stp`` + - ``control`` + - Traps STP packets + * - ``lacp`` + - ``control`` + - Traps LACP packets + * - ``lldp`` + - ``control`` + - Traps LLDP packets + * - ``igmp_query`` + - ``control`` + - Traps IGMP Membership Query packets + * - ``igmp_v1_report`` + - ``control`` + - Traps IGMP Version 1 Membership Report packets + * - ``igmp_v2_report`` + - ``control`` + - Traps IGMP Version 2 Membership Report packets + * - ``igmp_v3_report`` + - ``control`` + - Traps IGMP Version 3 Membership Report packets + * - ``igmp_v2_leave`` + - ``control`` + - Traps IGMP Version 2 Leave Group packets + * - ``mld_query`` + - ``control`` + - Traps MLD Multicast Listener Query packets + * - ``mld_v1_report`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Report packets + * - ``mld_v2_report`` + - ``control`` + - Traps MLD Version 2 Multicast Listener Report packets + * - ``mld_v1_done`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Done packets Driver-specific Packet Traps ============================ @@ -299,6 +335,15 @@ narrow. 
The description of these groups must be added to the following table: * - ``acl_drops`` - Contains packet traps for packets that were dropped by the device during ACL processing + * - ``stp`` + - Contains packet traps for STP packets + * - ``lacp`` + - Contains packet traps for LACP packets + * - ``lldp`` + - Contains packet traps for LLDP packets + * - ``mc_snooping`` + - Contains packet traps for IGMP and MLD packets required for multicast + snooping Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 851388c9d795..c0061542ad65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -645,6 +645,18 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, DEVLINK_TRAP_GENERIC_ID_INGRESS_FLOW_ACTION_DROP, DEVLINK_TRAP_GENERIC_ID_EGRESS_FLOW_ACTION_DROP, + DEVLINK_TRAP_GENERIC_ID_STP, + DEVLINK_TRAP_GENERIC_ID_LACP, + DEVLINK_TRAP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GENERIC_ID_IGMP_QUERY, + DEVLINK_TRAP_GENERIC_ID_IGMP_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V3_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_LEAVE, + DEVLINK_TRAP_GENERIC_ID_MLD_QUERY, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -661,6 +673,10 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_STP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -726,6 +742,30 @@ enum devlink_trap_group_generic_id { "ingress_flow_action_drop" #define DEVLINK_TRAP_GENERIC_NAME_EGRESS_FLOW_ACTION_DROP \ "egress_flow_action_drop" +#define DEVLINK_TRAP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_QUERY \ + "igmp_query" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V1_REPORT \ + "igmp_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_REPORT \ + "igmp_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V3_REPORT \ + "igmp_v3_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_LEAVE \ + "igmp_v2_leave" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_QUERY \ + "mld_query" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_REPORT \ + "mld_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V2_REPORT \ + "mld_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ + "mld_v1_done" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -739,6 +779,14 @@ enum devlink_trap_group_generic_id { "tunnel_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_DROPS \ "acl_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ + "mc_snooping" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 47c28e0f848f..c91ef1b5f738 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8495,6 +8495,18 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, 
DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8510,6 +8522,10 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From d77cfd162a346259222d0207a95bf1a0cc0c2520 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:42 +0300 Subject: devlink: Add layer 3 control packet traps Add layer 3 control packet traps such as ARP and DHCP, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 143 ++++++++++++++++++++++ include/net/devlink.h | 126 +++++++++++++++++++ net/core/devlink.c | 42 +++++++ 3 files changed, 311 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index e9fc3c9d7d7a..621b634b16be 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -288,6 +288,115 @@ be added to the following table: * - ``mld_v1_done`` - ``control`` - Traps MLD Version 1 Multicast Listener Done packets + * - ``ipv4_dhcp`` + - ``control`` + - Traps IPv4 DHCP packets + * - ``ipv6_dhcp`` + - ``control`` + - Traps IPv6 DHCP packets + * - ``arp_request`` + - ``control`` + - Traps ARP request packets + * - ``arp_response`` + - ``control`` + - Traps ARP response packets + * - ``arp_overlay`` + - ``control`` + - Traps NVE-decapsulated ARP packets that reached the overlay network. + This is required, for example, when the address that needs to be + resolved is a local address + * - ``ipv6_neigh_solicit`` + - ``control`` + - Traps IPv6 Neighbour Solicitation packets + * - ``ipv6_neigh_advert`` + - ``control`` + - Traps IPv6 Neighbour Advertisement packets + * - ``ipv4_bfd`` + - ``control`` + - Traps IPv4 BFD packets + * - ``ipv6_bfd`` + - ``control`` + - Traps IPv6 BFD packets + * - ``ipv4_ospf`` + - ``control`` + - Traps IPv4 OSPF packets + * - ``ipv6_ospf`` + - ``control`` + - Traps IPv6 OSPF packets + * - ``ipv4_bgp`` + - ``control`` + - Traps IPv4 BGP packets + * - ``ipv6_bgp`` + - ``control`` + - Traps IPv6 BGP packets + * - ``ipv4_vrrp`` + - ``control`` + - Traps IPv4 VRRP packets + * - ``ipv6_vrrp`` + - ``control`` + - Traps IPv6 VRRP packets + * - ``ipv4_pim`` + - ``control`` + - Traps IPv4 PIM packets + * - ``ipv6_pim`` + - ``control`` + - Traps IPv6 PIM packets + * - ``uc_loopback`` + - ``control`` + - Traps unicast packets that need to be routed through the same layer 3 + interface from which they were received. 
Such packets are routed by the + kernel, but also cause it to potentially generate ICMP redirect packets * - ``local_route`` + - ``control`` + - Traps unicast packets that hit a local route and need to be locally + delivered * - ``external_route`` + - ``control`` + - Traps packets that should be routed through an external interface (e.g., + management interface) that does not belong to the same device (e.g., + switch ASIC) as the ingress interface * - ``ipv6_uc_dip_link_local_scope`` + - ``control`` + - Traps unicast IPv6 packets that need to be routed and have a destination + IP address with a link-local scope (i.e., fe80::/10). The trap allows + device drivers to avoid programming link-local routes, but still receive + packets for local delivery * - ``ipv6_dip_all_nodes`` + - ``control`` + - Traps IPv6 packets whose destination IP address is the "All Nodes + Address" (i.e., ff02::1) * - ``ipv6_dip_all_routers`` + - ``control`` + - Traps IPv6 packets whose destination IP address is the "All Routers + Address" (i.e., ff02::2) * - ``ipv6_router_solicit`` + - ``control`` + - Traps IPv6 Router Solicitation packets * - ``ipv6_router_advert`` + - ``control`` + - Traps IPv6 Router Advertisement packets * - ``ipv6_redirect`` + - ``control`` + - Traps IPv6 Redirect Message packets * - ``ipv4_router_alert`` + - ``control`` + - Traps IPv4 packets that need to be routed and include the Router Alert + option. Such packets need to be locally delivered to raw sockets that + have the IP_ROUTER_ALERT socket option set * - ``ipv6_router_alert`` + - ``control`` + - Traps IPv6 packets that need to be routed and include the Router Alert + option in their Hop-by-Hop extension header. Such packets need to be + locally delivered to raw sockets that have the IPV6_ROUTER_ALERT socket + option set * - ``ptp_event`` + - ``control`` + - Traps PTP time-critical event messages (Sync, Delay_req, Pdelay_Req and + Pdelay_Resp) * - ``ptp_general`` + - ``control`` + - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, + Pdelay_Resp_Follow_Up, management and signaling) Driver-specific Packet Traps ============================ @@ -344,6 +453,40 @@ narrow. The description of these groups must be added to the following table: * - ``mc_snooping`` - Contains packet traps for IGMP and MLD packets required for multicast snooping + * - ``dhcp`` + - Contains packet traps for DHCP packets + * - ``neigh_discovery`` + - Contains packet traps for neighbour discovery packets (e.g., ARP, IPv6 + ND) + * - ``bfd`` + - Contains packet traps for BFD packets + * - ``ospf`` + - Contains packet traps for OSPF packets + * - ``bgp`` + - Contains packet traps for BGP packets + * - ``vrrp`` + - Contains packet traps for VRRP packets + * - ``pim`` + - Contains packet traps for PIM packets + * - ``uc_loopback`` + - Contains a packet trap for unicast loopback packets (i.e., + ``uc_loopback``). This trap is singled out because in cases such as + a one-armed router it will be constantly triggered.
To limit the impact on + the CPU usage, a packet trap policer with a low rate can be bound to the + group without affecting other traps + * - ``local_delivery`` + - Contains packet traps for packets that should be locally delivered after + routing, but do not match more specific packet traps (e.g., + ``ipv4_bgp``) + * - ``ipv6`` + - Contains packet traps for various IPv6 control packets (e.g., Router + Advertisements) + * - ``ptp_event`` + - Contains packet traps for PTP time-critical event messages (Sync, + Delay_req, Pdelay_Req and Pdelay_Resp) + * - ``ptp_general`` + - Contains packet traps for PTP general messages (Announce, Follow_Up, + Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index c0061542ad65..05a45dea976b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,36 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, + DEVLINK_TRAP_GENERIC_ID_IPV4_DHCP, + DEVLINK_TRAP_GENERIC_ID_IPV6_DHCP, + DEVLINK_TRAP_GENERIC_ID_ARP_REQUEST, + DEVLINK_TRAP_GENERIC_ID_ARP_RESPONSE, + DEVLINK_TRAP_GENERIC_ID_ARP_OVERLAY, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV4_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV6_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV4_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV6_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV4_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV6_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV4_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV6_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV4_PIM, + DEVLINK_TRAP_GENERIC_ID_IPV6_PIM, + DEVLINK_TRAP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GENERIC_ID_LOCAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_EXTERNAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV6_UC_DIP_LINK_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_NODES, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_ROUTERS, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_REDIRECT, + DEVLINK_TRAP_GENERIC_ID_IPV4_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -677,6 +707,18 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, + DEVLINK_TRAP_GROUP_GENERIC_ID_DHCP, + DEVLINK_TRAP_GROUP_GENERIC_ID_NEIGH_DISCOVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_BFD, + DEVLINK_TRAP_GROUP_GENERIC_ID_OSPF, + DEVLINK_TRAP_GROUP_GENERIC_ID_BGP, + DEVLINK_TRAP_GROUP_GENERIC_ID_VRRP, + DEVLINK_TRAP_GROUP_GENERIC_ID_PIM, + DEVLINK_TRAP_GROUP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GROUP_GENERIC_ID_LOCAL_DELIVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -766,6 +808,66 @@ enum devlink_trap_group_generic_id { "mld_v2_report" #define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ "mld_v1_done" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_DHCP \ + "ipv4_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DHCP \ + "ipv6_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_REQUEST \ + "arp_request" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_RESPONSE \ + "arp_response" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_OVERLAY \ + "arp_overlay" 
+#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_SOLICIT \ + "ipv6_neigh_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_ADVERT \ + "ipv6_neigh_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BFD \ + "ipv4_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BFD \ + "ipv6_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_OSPF \ + "ipv4_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_OSPF \ + "ipv6_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BGP \ + "ipv4_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BGP \ + "ipv6_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_VRRP \ + "ipv4_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_VRRP \ + "ipv6_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_PIM \ + "ipv4_pim" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_PIM \ + "ipv6_pim" +#define DEVLINK_TRAP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GENERIC_NAME_LOCAL_ROUTE \ + "local_route" +#define DEVLINK_TRAP_GENERIC_NAME_EXTERNAL_ROUTE \ + "external_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_UC_DIP_LINK_LOCAL_SCOPE \ + "ipv6_uc_dip_link_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_NODES \ + "ipv6_dip_all_nodes" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_ROUTERS \ + "ipv6_dip_all_routers" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_SOLICIT \ + "ipv6_router_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ADVERT \ + "ipv6_router_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_REDIRECT \ + "ipv6_redirect" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_ROUTER_ALERT \ + "ipv4_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ALERT \ + "ipv6_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -787,6 +889,30 @@ enum devlink_trap_group_generic_id { "lldp" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ "mc_snooping" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_DHCP \ + "dhcp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_NEIGH_DISCOVERY \ + "neigh_discovery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BFD \ + "bfd" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_OSPF \ + "ospf" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BGP \ + "bgp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_VRRP \ + "vrrp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PIM \ + "pim" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LOCAL_DELIVERY \ + "local_delivery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_IPV6 \ + "ipv6" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index c91ef1b5f738..f32854c3d0e7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8507,6 +8507,36 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + 
DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8526,6 +8556,18 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From 5eb18a2b6c11bf165271644ef1ab812b10659c8f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:43 +0300 Subject: devlink: Add ACL control packet traps Add packet traps for packets that are sampled / trapped by ACLs, so that capable drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 14 ++++++++++++++ include/net/devlink.h | 12 ++++++++++++ net/core/devlink.c | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 621b634b16be..1e3f3ffee248 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -397,6 +397,14 @@ be added to the following table: - ``control`` - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``flow_action_sample`` + - ``control`` + - Traps packets sampled during processing of flow action sample (e.g., via + tc's sample action) + * - ``flow_action_trap`` + - ``control`` + - Traps packets logged during processing of flow action trap (e.g., via + tc's trap action) Driver-specific Packet Traps ============================ @@ -487,6 +495,12 @@ narrow. 
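On the reporting path, a driver that receives such a sampled or trapped packet from the hardware would hand it back to devlink roughly as follows (the foo_* wrapper is hypothetical; devlink_trap_report() with the flow action cookie argument is the helper used by net/core/devlink.c in this series):

/* trap_ctx is the per-trap context devlink handed to the driver when the
 * flow_action_sample / flow_action_trap trap was set up; fa_cookie, which
 * may be NULL, carries the user cookie of the ACL rule that matched. */
static void foo_report_acl_pkt(struct devlink *devlink, struct sk_buff *skb,
			       void *trap_ctx, struct devlink_port *in_port,
			       const struct flow_action_cookie *fa_cookie)
{
	devlink_trap_report(devlink, skb, trap_ctx, in_port, fa_cookie);
}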
The description of these groups must be added to the following table: * - ``ptp_general`` - Contains packet traps for PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``acl_sample`` + - Contains packet traps for packets that were sampled by the device during + ACL processing + * - ``acl_trap`` + - Contains packet traps for packets that were trapped (logged) by the + device during ACL processing Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 05a45dea976b..1df6dfec26c2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -687,6 +687,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_SAMPLE, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_TRAP, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -719,6 +721,8 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -868,6 +872,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_SAMPLE \ + "flow_action_sample" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_TRAP \ + "flow_action_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -913,6 +921,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_SAMPLE \ + "acl_sample" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_TRAP \ + "acl_trap" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index f32854c3d0e7..2cafbc808b09 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8537,6 +8537,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8568,6 +8570,8 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From 0af413bd3e2de73bcf0742ed556be4af83c71964 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2020 22:13:58 +0200 Subject: flow_dissector: work around stack frame size warning The fl_flow_key structure is around 500 bytes, so having two of them on the stack in one function now exceeds the warning limit after an otherwise correct change: net/sched/cls_flower.c:298:12: error: stack frame size of 1056 bytes in function 'fl_classify' [-Werror,-Wframe-larger-than=] I suspect the fl_classify function could be reworked to only have one of them on the stack and modify it in place, but I could not work out how to do that. 
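To make the effect concrete, a minimal sketch with made-up types (not part of the patch): once the second large local lives in a noinline_for_stack helper, it occupies the helper's frame only for the duration of that call instead of sitting in the outer frame underneath everything else the outer function calls.

struct big_key { char buf[512]; };

static noinline_for_stack int lookup(const struct big_key *key)
{
	struct big_key masked = *key;	/* second copy, confined to this frame */

	return masked.buf[0];
}

static int classify(void)
{
	struct big_key key = {};	/* the only copy in classify()'s frame */

	return lookup(&key);		/* masked is live only during this call */
}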
As a somewhat hacky workaround, move one of them into an out-of-line function to reduce its scope. This does not necessarily reduce the stack usage of the outer function, but at least the second copy is removed from the stack during most of it and does not add up to whatever is called from there. I now see 552 bytes of stack usage for fl_classify(), plus 528 bytes for fl_mask_lookup(). Fixes: 58cff782cc55 ("flow_dissector: Parse multiple MPLS Label Stack Entries") Signed-off-by: Arnd Bergmann Acked-by: Cong Wang Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 96f5999281e0..030896eadd11 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -272,14 +272,16 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, return NULL; } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey, - struct fl_flow_key *key) +static noinline_for_stack +struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key) { + struct fl_flow_key mkey; + + fl_set_masked_key(&mkey, key, mask); if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) - return fl_lookup_range(mask, mkey, key); + return fl_lookup_range(mask, &mkey, key); - return __fl_lookup(mask, mkey); + return __fl_lookup(mask, &mkey); } static u16 fl_ct_info_to_flower_map[] = { @@ -299,7 +301,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct fl_flow_key skb_mkey; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -319,9 +320,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, mask); - - f = fl_lookup(mask, &skb_mkey, &skb_key); + f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); -- cgit v1.2.3 From 4b3a61b030d1131dcf3633a276158a3d0a435a47 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:47 +0000 Subject: bridge: mrp: Set the priority of MRP instance Each MRP instance has a priority; a lower value means a higher priority. The priority of an MRP instance is stored in the MRP_Test frames; in this way, all the MRP nodes in the ring can see the other nodes' priorities. Signed-off-by: Horatiu Vultur Signed-off-by: David S.
Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_mrp.c | 3 ++- net/bridge/br_mrp_netlink.c | 5 +++++ net/bridge/br_mrp_switchdev.c | 1 + net/bridge/br_private_mrp.h | 1 + 6 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index db519957e134..f82ef4c45f5e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -116,6 +116,7 @@ struct switchdev_obj_mrp { struct net_device *p_port; struct net_device *s_port; u32 ring_id; + u16 prio; }; #define SWITCHDEV_OBJ_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5a43eb86c93b..0162c1370ecb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -176,6 +176,7 @@ enum { IFLA_BRIDGE_MRP_INSTANCE_RING_ID, IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_PRIO, __IFLA_BRIDGE_MRP_INSTANCE_MAX, }; @@ -230,6 +231,7 @@ struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; + __u16 prio; }; struct br_mrp_ring_state { diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 8ea59504ef47..f8fd037219fe 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -147,7 +147,7 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); - hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + hdr->prio = cpu_to_be16(mrp->prio); ether_addr_copy(hdr->sa, p->br->dev->dev_addr); hdr->port_role = cpu_to_be16(port_role); hdr->state = cpu_to_be16(mrp->ring_state); @@ -290,6 +290,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) return -ENOMEM; mrp->ring_id = instance->ring_id; + mrp->prio = instance->prio; p = br_mrp_get_port(br, instance->p_ifindex); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index d9de780d2ce0..8cb67d9ca44e 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -22,6 +22,7 @@ br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 }, }; static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, @@ -49,6 +50,10 @@ static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + inst.prio = MRP_DEFAULT_PRIO; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]) + inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]); if (cmd == RTM_SETLINK) return br_mrp_add(br, &inst); diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 51cb1d5a24b4..3a776043bf80 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -12,6 +12,7 @@ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) .p_port = rtnl_dereference(mrp->p_port)->dev, .s_port = rtnl_dereference(mrp->s_port)->dev, .ring_id = mrp->ring_id, + .prio = mrp->prio, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index a0f53cc3ab85..558941ce2366 100644 --- 
a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -14,6 +14,7 @@ struct br_mrp { struct net_bridge_port __rcu *s_port; u32 ring_id; + u16 prio; enum br_mrp_ring_role_type ring_role; u8 ring_role_offloaded; -- cgit v1.2.3 From c6676e7d62cfb5cb7c1c5320a26f3634a11afdb0 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:48 +0000 Subject: bridge: mrp: Add support for role MRA A node that has the MRA role can behave as either an MRM or an MRC. Initially it starts as an MRM and sends MRP_Test frames on both ring ports. If it detects MRP_Test frames sent by another MRM, it checks whether these frames have a lower priority than its own. In that case it sends MRP_Nack frames to notify the other node that it needs to stop sending MRP_Test frames. If it receives an MRP_Nack frame, it stops sending MRP_Test frames and starts to behave as an MRC, but it continues to monitor the MRP_Test frames sent by the MRM. If at some point the MRM stops sending MRP_Test frames, the node takes over the MRM role and starts to send MRP_Test frames itself. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 + include/uapi/linux/mrp_bridge.h | 38 ++++++++++++ net/bridge/br_mrp.c | 125 ++++++++++++++++++++++++++++++++++------ net/bridge/br_mrp_netlink.c | 6 ++ net/bridge/br_mrp_switchdev.c | 4 +- net/bridge/br_private_mrp.h | 4 +- 7 files changed, 159 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index f82ef4c45f5e..b8c059b4e06d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -130,6 +130,7 @@ struct switchdev_obj_ring_test_mrp { u8 max_miss; u32 ring_id; u32 period; + bool monitor; }; #define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0162c1370ecb..caa6914a3e53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -222,6 +222,7 @@ enum { IFLA_BRIDGE_MRP_START_TEST_INTERVAL, IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, IFLA_BRIDGE_MRP_START_TEST_PERIOD, + IFLA_BRIDGE_MRP_START_TEST_MONITOR, __IFLA_BRIDGE_MRP_START_TEST_MAX, }; @@ -249,6 +250,7 @@ struct br_mrp_start_test { __u32 interval; __u32 max_miss; __u32 period; + __u32 monitor; }; struct bridge_stp_xstats { diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bcad42128d62..84f15f48a7cb 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -11,11 +11,14 @@ #define MRP_DOMAIN_UUID_LENGTH 16 #define MRP_VERSION 1 #define MRP_FRAME_PRIO 7 +#define MRP_OUI_LENGTH 3 +#define MRP_MANUFACTURE_DATA_LENGTH 2 enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_DISABLED, BR_MRP_RING_ROLE_MRC, BR_MRP_RING_ROLE_MRM, + BR_MRP_RING_ROLE_MRA, }; enum br_mrp_ring_state_type { @@ -43,6 +46,13 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_OPTION = 0x7f, +}; + +enum br_mrp_sub_tlv_header_type { + BR_MRP_SUB_TLV_HEADER_TEST_MGR_NACK = 0x1, + BR_MRP_SUB_TLV_HEADER_TEST_PROPAGATE = 0x2, + BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR = 0x3, }; struct br_mrp_tlv_hdr { @@ -50,6 +60,11 @@ struct br_mrp_tlv_hdr { __u8 type; __u8 length; }; +struct br_mrp_sub_tlv_hdr { + __u8 type; + __u8 length; +}; + struct br_mrp_end_hdr { struct br_mrp_tlv_hdr hdr; }; @@ -81,4 +96,27 @@ struct br_mrp_ring_link_hdr { __be16 blocked; }; +struct br_mrp_sub_opt_hdr { + __u8
type; + __u8 manufacture_data[MRP_MANUFACTURE_DATA_LENGTH]; +}; + +struct br_mrp_test_mgr_nack_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_test_prop_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_oui_hdr { + __u8 oui[MRP_OUI_LENGTH]; +}; + #endif diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index f8fd037219fe..24986ec7d38c 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -160,6 +160,16 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, return skb; } +/* This function is continuously called in the following cases: + * - when node role is MRM, in this case test_monitor is always set to false + * because it needs to notify the userspace that the ring is open and needs to + * send MRP_Test frames + * - when node role is MRA, there are 2 subcases: + * - when MRA behaves as MRM, in this case it is similar to the MRM role + * - when MRA behaves as MRC, in this case test_monitor is set to true, + * because it needs to detect when it stops seeing MRP_Test frames + * from MRM node but it doesn't need to send MRP_Test frames. + */ static void br_mrp_test_work_expired(struct work_struct *work) { struct delayed_work *del_work = to_delayed_work(work); @@ -177,8 +187,14 @@ static void br_mrp_test_work_expired(struct work_struct *work) /* Notify that the ring is open only if the ring state is * closed, otherwise it would continue to notify at every * interval. + * Also notify that the ring is open when the node has the + * role MRA and behaves as MRC. The reason is that the + * userspace needs to know when the MRM stopped sending + * MRP_Test frames so that the current node can try to take + * the role of the MRM. */ - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED || + mrp->test_monitor) notify_open = true; } @@ -186,12 +202,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->p_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -199,12 +218,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->s_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -227,7 +249,7 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); - br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); br_mrp_switchdev_del(br, mrp); @@ -452,8 +474,8 @@ int br_mrp_set_ring_role(struct net_bridge *br, return 0; } -/* Start to generate MRP test frames, the frames are generated by HW and if it - * fails, they are generated by the SW.
+/* Start to generate or monitor MRP test frames, the frames are generated by + * HW and if it fails, they are generated by the SW. * note: already called with rtnl_lock */ int br_mrp_start_test(struct net_bridge *br, @@ -464,16 +486,18 @@ int br_mrp_start_test(struct net_bridge *br, if (!mrp) return -EINVAL; - /* Try to push it to the HW and if it fails then continue to generate in - * SW and if that also fails then return error + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. */ if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, - test->max_miss, test->period)) + test->max_miss, test->period, + test->monitor)) return 0; mrp->test_interval = test->interval; mrp->test_end = jiffies + usecs_to_jiffies(test->period); mrp->test_max_miss = test->max_miss; + mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; queue_delayed_work(system_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); @@ -510,6 +534,57 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, br_mrp_port_open(port->dev, false); } +/* Determine if the test hdr has a better priority than the node */ +static bool br_mrp_test_better_than_own(struct br_mrp *mrp, + struct net_bridge *br, + const struct br_mrp_ring_test_hdr *hdr) +{ + u16 prio = be16_to_cpu(hdr->prio); + + if (prio < mrp->prio || + (prio == mrp->prio && + ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr))) + return true; + + return false; +} + +/* Process only MRP Test frames. All the other MRP frames are processed by + * the userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_ring_test_hdr *test_hdr; + struct br_mrp_ring_test_hdr _test_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_test_hdr), &_test_hdr); + if (!test_hdr) + return; + + /* Only frames that have a better priority than the node will + * clear the miss counter because otherwise the node will need to behave + * as MRM. + */ + if (br_mrp_test_better_than_own(mrp, br, test_hdr)) + mrp->test_count_miss = 0; +} + /* This will just forward the frame to the other mrp ring port (MRC role) or will * not do anything.
* note: already called with rcu_read_lock @@ -546,6 +621,18 @@ static int br_mrp_rcv(struct net_bridge_port *p, return 1; } + /* If the role is MRA then don't forward the frames if it behaves as + * MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + /* Clone the frame and forward it on the other MRP port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 8cb67d9ca44e..34b3a8776991 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -196,6 +196,7 @@ br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, }; static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, @@ -225,6 +226,11 @@ static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + test.monitor = false; + + if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]) + test.monitor = + nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]); return br_mrp_start_test(br, &test); } diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 3a776043bf80..0da68a0da4b5 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -65,7 +65,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, u32 interval, - u8 max_miss, u32 period) + u8 max_miss, u32 period, + bool monitor) { struct switchdev_obj_ring_test_mrp test = { .obj.orig_dev = br->dev, @@ -74,6 +75,7 @@ int br_mrp_switchdev_send_ring_test(struct net_bridge *br, .max_miss = max_miss, .ring_id = mrp->ring_id, .period = period, + .monitor = monitor, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 558941ce2366..33b255e38ffe 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -26,6 +26,7 @@ struct br_mrp { unsigned long test_end; u32 test_count_miss; u32 test_max_miss; + bool test_monitor; u32 seq_id; @@ -52,7 +53,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_ring_state_type state); int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period); + u32 interval, u8 max_miss, u32 period, + bool monitor); int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, -- cgit v1.2.3 From 4e4f4ce6abf5f6a8df0561776d3a790d60d519d0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 30 May 2020 20:49:56 +0200 Subject: cls_flower: remove mpls_opts_policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with W=1 gives the following warning: net/sched/cls_flower.c:731:1: warning: ‘mpls_opts_policy’ defined but not used [-Wunused-const-variable=] The 
TCA_FLOWER_KEY_MPLS_OPTS contains a list of TCA_FLOWER_KEY_MPLS_OPTS_LSE. Therefore, the attributes all have the same type and we can't parse the list with nla_parse*() and have the attributes validated automatically using an nla_policy. fl_set_key_mpls_opts() properly verifies that all attributes in the list are TCA_FLOWER_KEY_MPLS_OPTS_LSE. Then fl_set_key_mpls_lse() uses nla_parse_nested() on all these attributes, thus verifying that they have the NLA_F_NESTED flag. So we can safely drop the mpls_opts_policy. Reported-by: kbuild test robot Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 030896eadd11..b2da37286082 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -726,11 +726,6 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; -static const struct nla_policy -mpls_opts_policy[TCA_FLOWER_KEY_MPLS_OPTS_MAX + 1] = { - [TCA_FLOWER_KEY_MPLS_OPTS_LSE] = { .type = NLA_NESTED }, -}; - static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, -- cgit v1.2.3 From 547ce4cfb34cdecfa0ee19c29a5510329a7ac802 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 31 May 2020 02:06:55 +0100 Subject: switch cmsghdr_from_user_compat_to_kern() to copy_from_user() no point getting compat_cmsghdr field-by-field Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/compat.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index afd7b444e0bf..5e3041a2c37d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -183,20 +183,21 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, memset(kcmsg, 0, kcmlen); ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); while (ucmsg != NULL) { - if (__get_user(ucmlen, &ucmsg->cmsg_len)) + struct compat_cmsghdr cmsg; + if (copy_from_user(&cmsg, ucmsg, sizeof(cmsg))) goto Efault; - if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + if (!CMSG_COMPAT_OK(cmsg.cmsg_len, ucmsg, kmsg)) goto Einval; - tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); + tmp = ((cmsg.cmsg_len - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) goto Einval; kcmsg->cmsg_len = tmp; + kcmsg->cmsg_level = cmsg.cmsg_level; + kcmsg->cmsg_type = cmsg.cmsg_type; tmp = CMSG_ALIGN(tmp); - if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) || - __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || - copy_from_user(CMSG_DATA(kcmsg), + if (copy_from_user(CMSG_DATA(kcmsg), CMSG_COMPAT_DATA(ucmsg), - (ucmlen - sizeof(*ucmsg)))) + (cmsg.cmsg_len - sizeof(*ucmsg)))) goto Efault; /* Advance. */ -- cgit v1.2.3 From abe3cac8706bffeda3ebc06e4a9fa6e9cadacf26 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:33 -0700 Subject: bpf, sk_msg: Add some generic helpers that may be useful from sk_msg Add these generic helpers that may be useful to use from sk_msg programs. 
The helpers do not depend on ctx so we can simply add them here, BPF_FUNC_perf_event_output BPF_FUNC_get_current_uid_gid BPF_FUNC_get_current_pid_tgid BPF_FUNC_get_current_cgroup_id BPF_FUNC_get_current_ancestor_cgroup_id BPF_FUNC_get_cgroup_classid Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033903373.12355.15489763099696629346.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index bd2853d23b50..c3b496a19748 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6443,6 +6443,22 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 13d70f5a5ecff367db2fb18ed4ebe433eab8a74c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:15 -0700 Subject: bpf, sk_msg: Add get socket storage helpers Add helpers to use local socket storage. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033907577.12355.14740125020572756560.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index c3b496a19748..a6fc23447f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6449,6 +6449,10 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7273,6 +7277,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct 
sk_msg_md, local_ip4): @@ -8609,6 +8618,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { -- cgit v1.2.3 From c3c16f2ea6d20159903cf93afbb1155f3d8348d5 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 26 May 2020 17:34:36 -0700 Subject: bpf: Add rx_queue_mapping to bpf_sock Add "rx_queue_mapping" to bpf_sock. This gives read access for the existing field (sk_rx_queue_mapping) of struct sock from bpf_sock. Semantics for the bpf_sock rx_queue_mapping access are similar to sk_rx_queue_get(), i.e the value NO_QUEUE_MAPPING is not allowed and -1 is returned in that case. This is useful for transmit queue selection based on the received queue index which is cached in the socket in the receive path. v3: Addressed review comments to add usecase in patch description, and fixed default value for rx_queue_mapping. v2: fixed build error for CONFIG_XPS wrapping, reported by kbuild test robot Signed-off-by: Amritha Nambiar Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974ca6e948e3..630432c5c292 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3612,6 +3612,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { diff --git a/net/core/filter.c b/net/core/filter.c index a6fc23447f12..0008b029d644 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6849,6 +6849,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -7897,6 +7898,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program 
to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the program to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking the maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH maps that can have a program attached to their entries. Also block attaching BPF_XDP_DEVMAP programs directly to a device. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check
sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net 
*net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type.
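As a hedged illustration (not part of this patch; the section name and the hairpin check are assumptions following libbpf conventions of the time), a program written for a devmap entry could read the new field like this:

	/* Sketch: an XDP program loaded with expected_attach_type
	 * BPF_XDP_DEVMAP; the verifier rejects the egress_ifindex load
	 * for any other XDP program.
	 */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("xdp_devmap")
	int devmap_filter(struct xdp_md *ctx)
	{
		/* egress_ifindex resolves to txq->dev->ifindex */
		if (ctx->egress_ifindex == ctx->ingress_ifindex)
			return XDP_DROP; /* e.g. suppress hairpin redirects */
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";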
Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/devmap.c | 3 +++ net/core/filter.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 5 files changed, 29 insertions(+) (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 90f11760bd12..d54022959491 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -61,12 +61,17 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1e364d69007..f862a58fb567 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3707,6 +3707,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: diff --git a/net/core/filter.c b/net/core/filter.c index 0008b029d644..85ff827aab73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7015,6 +7015,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -7985,6 +7992,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 65d7717bce2f..f74bc4a2385e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3706,6 +3706,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { -- cgit v1.2.3 From ca2f5f21dbbd5e3a00cd3e97f728aa2ca0b2e011 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:41 -0700 Subject: bpf: Refactor sockmap redirect code so its easy to reuse We 
will need this block of code called from the TLS context shortly, so let's refactor the redirect logic so it's easy to reuse. This also cleans up the switch stmt so we have fewer fallthrough cases. No logic changes are intended. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Song Liu Link: https://lore.kernel.org/bpf/159079360110.5745.7024009076049029819.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/skmsg.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c479372f2cd2..9d72f71e9b47 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -682,13 +682,43 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; bool ingress; + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) { + kfree_skb(skb); + return; + } + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + kfree_skb(skb); + return; + } + + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + } else { + kfree_skb(skb); + } +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sock *sk_other; + switch (verdict) { case __SK_PASS: sk_other = psock->sk; @@ -707,25 +737,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ + sk_psock_skb_redirect(psock, skb); + break; case __SK_DROP: /* fall-through */ default: -- cgit v1.2.3 From e91de6afa81c10e9f855c5695eb9a53168d96b73 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:59 -0700 Subject: bpf: Fix running sk_skb program types with ktls KTLS uses a stream parser to collect TLS messages and send them to the upper layer tls receive handler. This ensures the tls receiver has a full TLS header to parse when it is run. However, when a socket has a BPF_SK_SKB_STREAM_VERDICT program attached before KTLS is enabled we end up with two stream parsers running on the same socket. The result is that both try to run on the same socket. First the KTLS stream parser runs and calls read_sock(), which for TCP is tcp_read_sock(), which in turn calls tcp_rcv_skb().
This dequeues the skb from the sk_receive_queue. When this is done, the KTLS code then calls the data_ready() callback which, because we stacked KTLS on top of the bpf stream verdict program, has been replaced by sk_psock_start_strp(). This will in turn kick the stream parser again and eventually do the same thing KTLS did above, calling into tcp_rcv_skb() and dequeuing an skb from the sk_receive_queue. At this point the data stream is broken. Part of the stream was handled by the KTLS side; some other bytes may have been handled by the BPF side. Generally this results in either missing data or, more likely, a "Bad Message" complaint from the kTLS receive handler, as the BPF program steals some bytes meant to be in a TLS header and/or the TLS header length is no longer correct. We've already broken the idealized model where we can stack ULPs in any order with generic callbacks on the TX side to handle this. So in this patch we do the same thing but for the RX side. We add a sk_psock_strp_enabled() helper so TLS can learn that a BPF verdict program is running and add a tls_sw_has_ctx_rx() helper so the BPF side can learn there is a TLS ULP on the socket. Then on the BPF side we omit calling our stream parser to avoid breaking the data stream for the KTLS receiver. Then on the KTLS side we call BPF_SK_SKB_STREAM_VERDICT once the KTLS receiver is done with the packet but before it posts the msg to userspace. This gives us symmetry between the TX and RX halves and IMO makes it usable again. On the TX side we process packets in this order: BPF -> TLS -> TCP, and on the receive side in the reverse order: TCP -> TLS -> BPF. Discovered while testing the OpenSSL 3.0 Alpha2.0 release. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079361946.5745.605854335665044485.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 8 ++++++++ include/net/tls.h | 9 +++++++++ net/core/skmsg.c | 43 ++++++++++++++++++++++++++++++++++++++++--- net/tls/tls_sw.c | 20 ++++++++++++++++++-- 4 files changed, 75 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ad31c9fb7158..08674cd14d5a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -437,4 +437,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->skb_verdict, NULL); } +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); + +static inline bool sk_psock_strp_enabled(struct sk_psock *psock) +{ + if (!psock) + return false; + return psock->parser.enabled; +} #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 3e7b44cae0d9..3212d3c214a9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -571,6 +571,15 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) return !!tls_sw_ctx_tx(ctx); } +static inline bool tls_sw_has_ctx_rx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_rx(ctx); +} + void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9d72f71e9b47..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include #include +#include static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -714,6 +715,38 @@ static void
sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { @@ -792,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c2763eb6aae..24f64bc0de18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk, long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool bpf_strp_enabled; int num_async = 0; int pending; @@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); + bpf_strp_enabled = sk_psock_strp_enabled(psock); /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false, @@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_decrypt <= len && !is_kvec && !is_peek && ctx->control == TLS_RECORD_TYPE_DATA && - prot->version != TLS_1_3_VERSION) + prot->version != TLS_1_3_VERSION && + !bpf_strp_enabled) zc = true; /* Do not use async mode if record is non-data */ - if (ctx->control == TLS_RECORD_TYPE_DATA) + if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) async_capable = ctx->async_capable; else async_capable = false; @@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk, goto pick_next_record; if (!zc) { + if (bpf_strp_enabled) { + err = sk_psock_tls_strp_read(psock, skb); + if (err != __SK_PASS) { + rxm->offset = rxm->offset + rxm->full_len; + rxm->full_len = 0; + if (err == __SK_DROP) + consume_skb(skb); + ctx->recv_pkt = NULL; + __strp_unpause(&ctx->strp); + continue; + } + } + if (rxm->full_len > len) { retain_skb = true; chunk = len; -- cgit v1.2.3 From 8ea204c2b658eaef55b4716fde469fb66c589a3d Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:00 +0200 Subject: net: Make locking in sock_bindtoindex optional sock_bindtoindex() is intended for kernel-wide usage; however, it locks the socket regardless of the context. This modification makes the locking optional: the socket is locked only when sock_bindtoindex() is called with lock_sk = true. The modification is applied to all users of sock_bindtoindex().
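For illustration only, a minimal kernel-side sketch (bind_example() is hypothetical, not part of this patch) of how a caller would pick the lock_sk value:

	#include <net/sock.h>

	static int bind_example(struct sock *sk, int ifindex, bool already_locked)
	{
		/* Pass lock_sk = false only when the socket lock is already
		 * held (e.g. from a setsockopt handler); passing true there
		 * would deadlock in lock_sock().
		 */
		return sock_bindtoindex(sk, ifindex, !already_locked);
	}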
Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/bee6355da40d9e991b2f2d12b67d55ebb5f5b207.1590871065.git.fejes@inf.elte.hu --- include/net/sock.h | 2 +- net/core/sock.c | 10 ++++++---- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 6e9f713a7860..c53cc42b5ab9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2690,7 +2690,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); -int sock_bindtoindex(struct sock *sk, int ifindex); +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 61ec573221a6..6c4acf1f0220 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -594,13 +594,15 @@ out: return ret; } -int sock_bindtoindex(struct sock *sk, int ifindex) +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; - lock_sock(sk); + if (lock_sk) + lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); - release_sock(sk); + if (lock_sk) + release_sock(sk); return ret; } @@ -646,7 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - return sock_bindtoindex(sk, index); + return sock_bindtoindex(sk, index, true); out: #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 2158e8bddf41..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,7 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 2e0ad1bc84a8..cdc4d4ee2420 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -30,7 +30,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } -- cgit v1.2.3 From 70c58997c1e864c96dfdf072572047303db8f42a Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:01 +0200 Subject: bpf: Allow SO_BINDTODEVICE opt in bpf_setsockopt Extend the supported sockopts in bpf_setsockopt with SO_BINDTODEVICE. We call sock_bindtoindex with parameter lock_sk = false in this context because we already own the socket.
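A hedged usage sketch, not from this patch: a sockops program binding outgoing TCP sockets to a device. The SOL_SOCKET and SO_BINDTODEVICE values below are the asm-generic ones, and "eth0" is a placeholder name:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#ifndef SOL_SOCKET
	#define SOL_SOCKET 1		/* asm-generic value */
	#endif
	#ifndef SO_BINDTODEVICE
	#define SO_BINDTODEVICE 25	/* asm-generic value */
	#endif

	SEC("sockops")
	int bind_to_dev(struct bpf_sock_ops *skops)
	{
		char dev[] = "eth0";	/* placeholder device name */

		if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB)
			bpf_setsockopt(skops, SOL_SOCKET, SO_BINDTODEVICE,
				       dev, sizeof(dev));
		return 1;
	}

	char _license[] SEC("license") = "GPL";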
Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/4149e304867b8d5a606a305bc59e29b063e51f49.1590871065.git.fejes@inf.elte.hu --- net/core/filter.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 85ff827aab73..ae82bcb03124 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4248,6 +4248,9 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { + char devname[IFNAMSIZ]; + struct net *net; + int ifindex; int ret = 0; int val; @@ -4257,7 +4260,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) + if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); @@ -4298,6 +4301,29 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk_dst_reset(sk); } break; + case SO_BINDTODEVICE: + ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + optlen = min_t(long, optlen, IFNAMSIZ - 1); + strncpy(devname, optval, optlen); + devname[optlen] = 0; + + ifindex = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + ret = -ENODEV; + + net = sock_net(sk); + dev = dev_get_by_name(net, devname); + if (!dev) + break; + ifindex = dev->ifindex; + dev_put(dev); + } + ret = sock_bindtoindex(sk, ifindex, false); +#endif + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 171526f6fee84de0c39e2b7aa7e666ba0bbfd173 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:35 +0200 Subject: flow_dissector: Pull locking up from prog attach callback Split out the part of attach callback that happens with attach/detach lock acquired. This structures the prog attach callback in a way that opens up doors for moving the locking out of flow_dissector and into generic callbacks for attaching/detaching progs to netns in subsequent patches. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-2-jakub@cloudflare.com --- net/core/flow_dissector.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0aeb33572feb..b64a44a083fd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -109,15 +109,10 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +static int flow_dissector_bpf_prog_attach(struct net *net, + struct bpf_prog *prog) { struct bpf_prog *attached; - struct net *net; - int ret = 0; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -130,33 +125,38 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(ns->flow_dissector_prog)) + return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. 
*/ - if (rcu_access_pointer(init_net.flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(init_net.flow_dissector_prog)) + return -EEXIST; } attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached == prog) { + if (attached == prog) /* The same program cannot be attached twice */ - ret = -EINVAL; - goto out; - } + return -EINVAL; + rcu_assign_pointer(net->flow_dissector_prog, prog); if (attached) bpf_prog_put(attached); -out: + return 0; +} + +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + int ret; + + mutex_lock(&flow_dissector_mutex); + ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); mutex_unlock(&flow_dissector_mutex); + return ret; } -- cgit v1.2.3 From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns we will need to keep more state per netns than a single pointer like we have now for BPF flow dissector program. Prepare for the above by extracting netns_bpf that is part of struct net, for storing all state related to BPF programs attached to netns. Turn flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. Next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 56 +++++++++++++++++++++++ include/linux/skbuff.h | 26 ----------- include/net/net_namespace.h | 4 +- include/net/netns/bpf.h | 17 +++++++ kernel/bpf/syscall.c | 7 +-- net/core/flow_dissector.c | 105 +++++++++++++++++++++++++++++--------------- 6 files changed, 149 insertions(+), 66 deletions(-) create mode 100644 include/linux/bpf-netns.h create mode 100644 include/net/netns/bpf.h (limited to 'net') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h new file mode 100644 index 000000000000..f3aec3d79824 --- /dev/null +++ b/include/linux/bpf-netns.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_NETNS_H +#define _BPF_NETNS_H + +#include +#include + +enum netns_bpf_attach_type { + NETNS_BPF_INVALID = -1, + NETNS_BPF_FLOW_DISSECTOR = 0, + MAX_NETNS_BPF_ATTACH_TYPE +}; + +static inline enum netns_bpf_attach_type +to_netns_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + case BPF_FLOW_DISSECTOR: + return NETNS_BPF_FLOW_DISSECTOR; + default: + return NETNS_BPF_INVALID; + } +} + +/* Protects updates to netns_bpf */ +extern struct mutex netns_bpf_mutex; + +union bpf_attr; +struct bpf_prog; + +#ifdef CONFIG_NET +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); +int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); +int netns_bpf_prog_detach(const union bpf_attr *attr); +#else +static inline int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + 
return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +#endif /* _BPF_NETNS_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 531843952809..a0d5c2760103 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1283,32 +1283,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -#ifdef CONFIG_NET -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog); - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); -#else -static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return -EOPNOTSUPP; -} -#endif - struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8e001e049497..2ee5901bec7a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,8 @@ struct net { #endif struct net_generic __rcu *gen; - struct bpf_prog __rcu *flow_dissector_prog; + /* Used to store attached BPF programs */ + struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h new file mode 100644 index 000000000000..a858d1c5b166 --- /dev/null +++ b/include/net/netns/bpf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF programs attached to network namespace + */ + +#ifndef __NETNS_BPF_H__ +#define __NETNS_BPF_H__ + +#include + +struct bpf_prog; + +struct netns_bpf { + struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; +}; + +#endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return 
netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b64a44a083fd..6c1b8e43d611 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,10 @@ #include #include #endif +#include -static DEFINE_MUTEX(flow_dissector_mutex); +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,23 +72,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; if (attr->query.query_flags) return -EINVAL; + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { prog_cnt = 1; prog_id = attached->aux->id; @@ -112,6 +119,7 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, static int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; if (net == &init_net) { @@ -125,74 +133,97 @@ static int flow_dissector_bpf_prog_attach(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) + if (rcu_access_pointer(ns->bpf.progs[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) + if (rcu_access_pointer(init_net.bpf.progs[type])) return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); if (attached == prog) /* The same program cannot be attached twice */ return -EINVAL; - rcu_assign_pointer(net->flow_dissector_prog, prog); + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + enum netns_bpf_attach_type type; + struct net *net; int ret; - mutex_lock(&flow_dissector_mutex); - ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); - mutex_unlock(&flow_dissector_mutex); + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); return ret; } -static int flow_dissector_bpf_prog_detach(struct net *net) +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) { struct bpf_prog *attached; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + RCU_INIT_POINTER(net->bpf.progs[type], NULL); bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr) { - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; } -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. - */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); } -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, }; /** @@ -1044,11 +1075,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1870,6 +1903,6 @@ static int __init init_default_flow_dissectors(void) flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - return register_pernet_subsys(&flow_dissector_pernet_ops); + return register_pernet_subsys(&netns_bpf_pernet_ops); } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From b27f7bb590ba835b32ef122389db158e44cfda1e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:37 +0200 Subject: flow_dissector: Move out netns_bpf prog callbacks Move functions to manage BPF programs attached to netns that are not specific to flow dissector to a dedicated module named bpf/net_namespace.c. The set of functions will grow with the addition of bpf_link support for netns attached programs. This patch prepares ground by creating a place for it. This is a code move with no functional changes intended. 
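For context, a hedged userspace sketch (not part of this patch; pre-1.0 libbpf wrapper signatures assumed) of the fd-based attach and query paths these callbacks serve. Attach applies to the caller's current netns, while query takes an explicit netns fd as target_fd:

	#include <fcntl.h>
	#include <unistd.h>
	#include <bpf/bpf.h>

	int attach_and_query(int prog_fd)
	{
		__u32 prog_ids[1], prog_cnt = 1;
		int netns_fd, err;

		err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
		if (err)
			return err;

		netns_fd = open("/proc/self/ns/net", O_RDONLY);
		if (netns_fd < 0)
			return -1;
		err = bpf_prog_query(netns_fd, BPF_FLOW_DISSECTOR, 0, NULL,
				     prog_ids, &prog_cnt);
		close(netns_fd);
		return err;
	}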
Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-4-jakub@cloudflare.com --- include/net/flow_dissector.h | 6 ++ kernel/bpf/Makefile | 1 + kernel/bpf/net_namespace.c | 133 +++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 125 ++-------------------------------------- 4 files changed, 144 insertions(+), 121 deletions(-) create mode 100644 kernel/bpf/net_namespace.c (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 4fb1a69c6ecf..a7eba43fe4e4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -8,6 +8,8 @@ #include #include +struct bpf_prog; +struct net; struct sk_buff; /** @@ -369,4 +371,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, memset(key_basic, 0, sizeof(*key_basic)); } +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +#endif /* CONFIG_BPF_SYSCALL */ + #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8fca02f64811..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..b37d81450c3a --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Functions to manage BPF programs attached to netns + */ + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. 
*/ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6c1b8e43d611..d02df0b6d0d9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -33,9 +33,6 @@ #endif #include -/* Protects updates to netns_bpf */ -DEFINE_MUTEX(netns_bpf_mutex); - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { @@ -72,52 +69,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int netns_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; - enum netns_bpf_attach_type type; - struct bpf_prog *attached; - struct net *net; - - if (attr->query.query_flags) - return -EINVAL; - - type = to_netns_bpf_attach_type(attr->query.attach_type); - if (type < 0) - return -EINVAL; - - net = get_net_ns_by_fd(attr->query.target_fd); - if (IS_ERR(net)) - return PTR_ERR(net); - - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); - - put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; -} - -static int flow_dissector_bpf_prog_attach(struct net *net, - struct bpf_prog *prog) +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; @@ -155,76 +108,7 @@ static int flow_dissector_bpf_prog_attach(struct net *net, bpf_prog_put(attached); return 0; } - -int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) -{ - enum netns_bpf_attach_type type; - struct net *net; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - net = current->nsproxy->net_ns; - mutex_lock(&netns_bpf_mutex); - switch (type) { - case NETNS_BPF_FLOW_DISSECTOR: - ret = 
flow_dissector_bpf_prog_attach(net, prog); - break; - default: - ret = -EINVAL; - break; - } - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -/* Must be called with netns_bpf_mutex held. */ -static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) -{ - struct bpf_prog *attached; - - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (!attached) - return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); - bpf_prog_put(attached); - return 0; -} - -int netns_bpf_prog_detach(const union bpf_attr *attr) -{ - enum netns_bpf_attach_type type; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) -{ - enum netns_bpf_attach_type type; - - mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); - mutex_unlock(&netns_bpf_mutex); -} - -static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { - .pre_exit = netns_bpf_pernet_pre_exit, -}; +#endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -1902,7 +1786,6 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - - return register_pernet_subsys(&netns_bpf_pernet_ops); + return 0; } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From 4c21daae3dbc9f8536cc18e6e53627821fa2c90c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 May 2020 22:34:07 +0800 Subject: tipc: Fix NULL pointer dereference in __tipc_sendstream() tipc_sendstream() may send a zero-length packet; tipc_msg_append() then does not allocate an skb, skb_peek_tail() returns NULL, and msg_set_ack_required() triggers a NULL pointer dereference. Reported-by: syzbot+8eac6d030e7807c21d32@syzkaller.appspotmail.com Fixes: 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/tipc/socket.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3734cdbedc9c..26123f4177fd 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1588,8 +1588,12 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) tsk->pkt_cnt += skb_queue_len(txq); } else { skb = skb_peek_tail(txq); - msg_set_ack_required(buf_msg(skb)); - tsk->expect_ack = true; + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } tsk->msg_acc = 0; tsk->pkt_cnt = 0; } -- cgit v1.2.3 From 79a1f0ccdbb4ad700590f61b00525b390cb53905 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 1 Jun 2020 11:55:03 +0800 Subject: ipv6: fix IPV6_ADDRFORM operation logic Socket option IPV6_ADDRFORM currently supports UDP/UDPLITE and TCP.
Previously, the checking logic looked like this:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    else if (sk->sk_protocol != IPPROTO_TCP)
            break;

After commit b6f6118901d1 ("ipv6: restrict IPV6_ADDRFORM operation"), TCP was blocked because the logic changed to:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    else if (sk->sk_protocol == IPPROTO_TCP)
            do_some_check;
            break;
    else
            break;

Then, after commit 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation"), UDP/UDPLITE were blocked because the logic changed to:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    if (sk->sk_protocol == IPPROTO_TCP)
            do_some_check;
    if (sk->sk_protocol != IPPROTO_TCP)
            break;

Fix it by using Eric's code and simply removing the break in the TCP check, so the logic becomes:

    if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE)
            do_some_check;
    else if (sk->sk_protocol == IPPROTO_TCP)
            do_some_check;
    else
            break;

Fixes: 82c9ae440857 ("ipv6: fix restrict IPV6_ADDRFORM operation") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index adbfed6adf11..2c843ff5e3a9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -218,14 +218,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_prot != &tcpv6_prot) { - retv = -EBUSY; + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } + } else { + break; } - if (sk->sk_protocol != IPPROTO_TCP) - break; + if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- cgit v1.2.3 From 6abde0b241224347cd88e2ae75902e07f55c42cb Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Tue, 2 Jun 2020 00:07:05 +0530 Subject: crypto/chtls: IPv6 support for inline TLS Extends support to IPv6 for the Inline TLS server. Signed-off-by: Vinay Kumar Yadav v1->v2: - cc'd tcp folks. v2->v3: - changed EXPORT_SYMBOL() to EXPORT_SYMBOL_GPL() Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 195 ++++++++++++++++++++++++------ drivers/crypto/chelsio/chtls/chtls_cm.h | 1 + drivers/crypto/chelsio/chtls/chtls_main.c | 14 ++- net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 168 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index d5720a859443..9a642c79a657 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -18,13 +18,20 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include +#include #include "chtls.h" #include "chtls_cm.h" +#include "clip_tbl.h" /* * State transitions and actions for close.
Note that if we are in SYN_SENT @@ -82,15 +89,36 @@ static void chtls_sock_release(struct kref *ref) kfree(csk); } -static struct net_device *chtls_ipv4_netdev(struct chtls_dev *cdev, +static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, struct sock *sk) { struct net_device *ndev = cdev->ports[0]; + struct net_device *temp; + int addr_type; + + switch (sk->sk_family) { + case PF_INET: + if (likely(!inet_sk(sk)->inet_rcv_saddr)) + return ndev; + ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); + break; + case PF_INET6: + addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + if (likely(addr_type == IPV6_ADDR_ANY)) + return ndev; + + for_each_netdev_rcu(&init_net, temp) { + if (ipv6_chk_addr(&init_net, (struct in6_addr *) + &sk->sk_v6_rcv_saddr, temp, 1)) { + ndev = temp; + break; + } + } + break; + default: + return NULL; + } - if (likely(!inet_sk(sk)->inet_rcv_saddr)) - return ndev; - - ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); if (!ndev) return NULL; @@ -446,7 +474,10 @@ void chtls_destroy_sock(struct sock *sk) free_tls_keyid(sk); kref_put(&csk->kref, chtls_sock_release); csk->cdev = NULL; - sk->sk_prot = &tcp_prot; + if (sk->sk_family == AF_INET) + sk->sk_prot = &tcp_prot; + else + sk->sk_prot = &tcpv6_prot; sk->sk_prot->destroy(sk); } @@ -473,7 +504,8 @@ static void chtls_disconnect_acceptq(struct sock *listen_sk) while (*pprev) { struct request_sock *req = *pprev; - if (req->rsk_ops == &chtls_rsk_ops) { + if (req->rsk_ops == &chtls_rsk_ops || + req->rsk_ops == &chtls_rsk_opsv6) { struct sock *child = req->sk; *pprev = req->dl_next; @@ -600,14 +632,13 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) struct listen_ctx *ctx; struct adapter *adap; struct port_info *pi; + bool clip_valid; int stid; int ret; - if (sk->sk_family != PF_INET) - return -EAGAIN; - + clip_valid = false; rcu_read_lock(); - ndev = chtls_ipv4_netdev(cdev, sk); + ndev = chtls_find_netdev(cdev, sk); rcu_read_unlock(); if (!ndev) return -EBADF; @@ -638,16 +669,35 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) if (!listen_hash_add(cdev, sk, stid)) goto free_stid; - ret = cxgb4_create_server(ndev, stid, - inet_sk(sk)->inet_rcv_saddr, - inet_sk(sk)->inet_sport, 0, - cdev->lldi->rxq_ids[0]); + if (sk->sk_family == PF_INET) { + ret = cxgb4_create_server(ndev, stid, + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_sport, 0, + cdev->lldi->rxq_ids[0]); + } else { + int addr_type; + + addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + if (addr_type != IPV6_ADDR_ANY) { + ret = cxgb4_clip_get(ndev, (const u32 *) + &sk->sk_v6_rcv_saddr, 1); + if (ret) + goto del_hash; + clip_valid = true; + } + ret = cxgb4_create_server6(ndev, stid, + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_sport, + cdev->lldi->rxq_ids[0]); + } if (ret > 0) ret = net_xmit_errno(ret); if (ret) goto del_hash; return 0; del_hash: + if (clip_valid) + cxgb4_clip_release(ndev, (const u32 *)&sk->sk_v6_rcv_saddr, 1); listen_hash_del(cdev, sk); free_stid: cxgb4_free_stid(cdev->tids, stid, sk->sk_family); @@ -661,6 +711,8 @@ free_ctx: void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) { struct listen_ctx *listen_ctx; + struct chtls_sock *csk; + int addr_type = 0; int stid; stid = listen_hash_del(cdev, sk); @@ -671,7 +723,16 @@ void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) chtls_reset_synq(listen_ctx); cxgb4_remove_server(cdev->lldi->ports[0], stid, - cdev->lldi->rxq_ids[0], 0); + cdev->lldi->rxq_ids[0], sk->sk_family == PF_INET6); + + if (sk->sk_family == 
PF_INET6) { + csk = rcu_dereference_sk_user_data(sk); + addr_type = ipv6_addr_type((const struct in6_addr *) + &sk->sk_v6_rcv_saddr); + if (addr_type != IPV6_ADDR_ANY) + cxgb4_clip_release(csk->egress_dev, (const u32 *) + &sk->sk_v6_rcv_saddr, 1); + } chtls_disconnect_acceptq(sk); } @@ -880,7 +941,10 @@ static unsigned int chtls_select_mss(const struct chtls_sock *csk, tp = tcp_sk(sk); tcpoptsz = 0; - iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); + if (sk->sk_family == AF_INET6) + iphdrsz = sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + else + iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); if (req->tcpopt.tstamp) tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4); @@ -1045,11 +1109,29 @@ static struct sock *chtls_recv_sock(struct sock *lsk, if (!newsk) goto free_oreq; - dst = inet_csk_route_child_sock(lsk, newsk, oreq); - if (!dst) - goto free_sk; + if (lsk->sk_family == AF_INET) { + dst = inet_csk_route_child_sock(lsk, newsk, oreq); + if (!dst) + goto free_sk; - n = dst_neigh_lookup(dst, &iph->saddr); + n = dst_neigh_lookup(dst, &iph->saddr); + } else { + const struct ipv6hdr *ip6h; + struct flowi6 fl6; + + ip6h = (const struct ipv6hdr *)network_hdr; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = ip6h->daddr; + fl6.daddr = ip6h->saddr; + fl6.fl6_dport = inet_rsk(oreq)->ir_rmt_port; + fl6.fl6_sport = htons(inet_rsk(oreq)->ir_num); + security_req_classify_flow(oreq, flowi6_to_flowi(&fl6)); + dst = ip6_dst_lookup_flow(sock_net(lsk), lsk, &fl6, NULL); + if (IS_ERR(dst)) + goto free_sk; + n = dst_neigh_lookup(dst, &ip6h->saddr); + } if (!n) goto free_sk; @@ -1072,9 +1154,28 @@ static struct sock *chtls_recv_sock(struct sock *lsk, tp = tcp_sk(newsk); newinet = inet_sk(newsk); - newinet->inet_daddr = iph->saddr; - newinet->inet_rcv_saddr = iph->daddr; - newinet->inet_saddr = iph->daddr; + if (iph->version == 0x4) { + newinet->inet_daddr = iph->saddr; + newinet->inet_rcv_saddr = iph->daddr; + newinet->inet_saddr = iph->daddr; + } else { + struct tcp6_sock *newtcp6sk = (struct tcp6_sock *)newsk; + struct inet_request_sock *treq = inet_rsk(oreq); + struct ipv6_pinfo *newnp = inet6_sk(newsk); + struct ipv6_pinfo *np = inet6_sk(lsk); + + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newsk->sk_v6_daddr = treq->ir_v6_rmt_addr; + newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr; + inet6_sk(newsk)->saddr = treq->ir_v6_loc_addr; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newsk->sk_bound_dev_if = treq->ir_iif; + newinet->inet_opt = NULL; + newinet->inet_daddr = LOOPBACK4_IPV6; + newinet->inet_saddr = LOOPBACK4_IPV6; + } oreq->ts_recent = PASS_OPEN_TID_G(ntohl(req->tos_stid)); sk_setup_caps(newsk, dst); @@ -1156,6 +1257,7 @@ static void chtls_pass_accept_request(struct sock *sk, struct sk_buff *reply_skb; struct chtls_sock *csk; struct chtls_dev *cdev; + struct ipv6hdr *ip6h; struct tcphdr *tcph; struct sock *newsk; struct ethhdr *eh; @@ -1196,37 +1298,50 @@ static void chtls_pass_accept_request(struct sock *sk, if (sk_acceptq_is_full(sk)) goto reject; - oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); - if (!oreq) - goto reject; - - oreq->rsk_rcv_wnd = 0; - oreq->rsk_window_clamp = 0; - oreq->cookie_ts = 0; - oreq->mss = 0; - oreq->ts_recent = 0; eth_hdr_len = T6_ETH_HDR_LEN_G(ntohl(req->hdr_len)); if (eth_hdr_len == ETH_HLEN) { eh = (struct ethhdr *)(req + 1); iph = (struct iphdr *)(eh + 1); + ip6h = (struct ipv6hdr *)(eh + 1); network_hdr = (void *)(eh + 1); } else { vlan_eh = (struct 
vlan_ethhdr *)(req + 1); iph = (struct iphdr *)(vlan_eh + 1); + ip6h = (struct ipv6hdr *)(vlan_eh + 1); network_hdr = (void *)(vlan_eh + 1); } - if (iph->version != 0x4) - goto free_oreq; - tcph = (struct tcphdr *)(iph + 1); - skb_set_network_header(skb, (void *)iph - (void *)req); + if (iph->version == 0x4) { + tcph = (struct tcphdr *)(iph + 1); + skb_set_network_header(skb, (void *)iph - (void *)req); + oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); + } else { + tcph = (struct tcphdr *)(ip6h + 1); + skb_set_network_header(skb, (void *)ip6h - (void *)req); + oreq = inet_reqsk_alloc(&chtls_rsk_opsv6, sk, false); + } + + if (!oreq) + goto reject; + + oreq->rsk_rcv_wnd = 0; + oreq->rsk_window_clamp = 0; + oreq->cookie_ts = 0; + oreq->mss = 0; + oreq->ts_recent = 0; tcp_rsk(oreq)->tfo_listener = false; tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq); chtls_set_req_port(oreq, tcph->source, tcph->dest); - chtls_set_req_addr(oreq, iph->daddr, iph->saddr); - ip_dsfield = ipv4_get_dsfield(iph); + if (iph->version == 0x4) { + chtls_set_req_addr(oreq, iph->daddr, iph->saddr); + ip_dsfield = ipv4_get_dsfield(iph); + } else { + inet_rsk(oreq)->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + inet_rsk(oreq)->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ip_dsfield = ipv6_get_dsfield(ipv6_hdr(skb)); + } if (req->tcpopt.wsf <= 14 && sock_net(sk)->ipv4.sysctl_tcp_window_scaling) { inet_rsk(oreq)->wscale_ok = 1; @@ -1243,7 +1358,7 @@ static void chtls_pass_accept_request(struct sock *sk, newsk = chtls_recv_sock(sk, oreq, network_hdr, req, cdev); if (!newsk) - goto reject; + goto free_oreq; if (chtls_get_module(newsk)) goto reject; diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.h b/drivers/crypto/chelsio/chtls/chtls_cm.h index 3fac0c74a41f..47ba81e42f5d 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.h +++ b/drivers/crypto/chelsio/chtls/chtls_cm.h @@ -79,6 +79,7 @@ enum { typedef void (*defer_handler_t)(struct chtls_dev *dev, struct sk_buff *skb); extern struct request_sock_ops chtls_rsk_ops; +extern struct request_sock_ops chtls_rsk_opsv6; struct deferred_skb_cb { defer_handler_t handler; diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c index 2110d0893bc7..7dfffdde9593 100644 --- a/drivers/crypto/chelsio/chtls/chtls_main.c +++ b/drivers/crypto/chelsio/chtls/chtls_main.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -30,8 +32,8 @@ static DEFINE_MUTEX(cdev_mutex); static DEFINE_MUTEX(notify_mutex); static RAW_NOTIFIER_HEAD(listen_notify_list); -static struct proto chtls_cpl_prot; -struct request_sock_ops chtls_rsk_ops; +static struct proto chtls_cpl_prot, chtls_cpl_protv6; +struct request_sock_ops chtls_rsk_ops, chtls_rsk_opsv6; static uint send_page_order = (14 - PAGE_SHIFT < 0) ? 
0 : 14 - PAGE_SHIFT; static void register_listen_notifier(struct notifier_block *nb) @@ -586,7 +588,10 @@ static struct cxgb4_uld_info chtls_uld_info = { void chtls_install_cpl_ops(struct sock *sk) { - sk->sk_prot = &chtls_cpl_prot; + if (sk->sk_family == AF_INET) + sk->sk_prot = &chtls_cpl_prot; + else + sk->sk_prot = &chtls_cpl_protv6; } static void __init chtls_init_ulp_ops(void) @@ -603,6 +608,9 @@ static void __init chtls_init_ulp_ops(void) chtls_cpl_prot.recvmsg = chtls_recvmsg; chtls_cpl_prot.setsockopt = chtls_setsockopt; chtls_cpl_prot.getsockopt = chtls_getsockopt; + chtls_cpl_protv6 = chtls_cpl_prot; + chtls_init_rsk_ops(&chtls_cpl_protv6, &chtls_rsk_opsv6, + &tcpv6_prot, PF_INET6); } static int __init chtls_register(void) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b7415ca75c2d..f67d45ff00b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { #endif .diag_destroy = tcp_abort, }; +EXPORT_SYMBOL_GPL(tcpv6_prot); /* thinking of making this const? Don't. * early_demux can change based on sysctl. -- cgit v1.2.3 From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported:

  In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet:

      bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO)
      bpf_redirect(skb->ifindex, BPF_F_INGRESS)

  It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with an invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver:

      | ETH | IP | UDP | GUE | IP | TCP |
      skb->ip_summed == CHECKSUM_UNNECESSARY

  ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following:

      | ETH | IP | TCP |
      skb->ip_summed == CHECKSUM_UNNECESSARY

  Note that ip_summed is still CHECKSUM_UNNECESSARY. After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY.

The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case the skb->csum_level of the original skb prior to the bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE, and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users to opt out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually, e.g. through the bpf_csum_level() helper that is added in a subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses the bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions from v4 to v6 and vice versa, therefore no adaptation is required there.
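As an illustration of the resulting helper contract, a minimal tc decap sketch (hypothetical program, not taken from this patch; ENCAP_LEN stands in for the outer header length being removed):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    #define ENCAP_LEN 32 /* assumed outer IPv4(20) + UDP(8) + GUE(4) */

    SEC("classifier")
    int decap_gue(struct __sk_buff *skb)
    {
            /* Shrink at the MAC layer to drop the outer headers. With
             * this patch the helper also resets CHECKSUM_UNNECESSARY to
             * CHECKSUM_NONE, so the stack re-validates inner checksums.
             */
            if (bpf_skb_adjust_room(skb, -ENCAP_LEN, BPF_ADJ_ROOM_MAC,
                                    BPF_F_ADJ_ROOM_FIXED_GSO))
                    return TC_ACT_SHOT;

            return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
    }

    char _license[] SEC("license") = "GPL";

Passing BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_NO_CSUM_RESET instead keeps CHECKSUM_UNNECESSARY and leaves fixing up skb->csum_level to the program.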
[0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? 
bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3 From 7cdec54f9713256bb170873a1fc5c75c9127c9d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:33 +0200 Subject: bpf: Add csum_level helper for fixing up csum levels Add a bpf_csum_level() helper which BPF programs can use in combination with bpf_skb_adjust_room() when they pass the BPF_F_ADJ_ROOM_NO_CSUM_RESET flag to the latter to avoid falling back to CHECKSUM_NONE. bpf_csum_level() allows adjusting the CHECKSUM_UNNECESSARY skb->csum_level via BPF_CSUM_LEVEL_{INC,DEC}, which call __skb_{incr,decr}_checksum_unnecessary() on the skb. The helper also offers BPF_CSUM_LEVEL_RESET, which sets the skb's csum to CHECKSUM_NONE, as well as BPF_CSUM_LEVEL_QUERY to just return the current level. Without this helper, there is otherwise no way to adjust the skb->csum_level. I did not add an extra dummy flags argument, as there is plenty of free bit space in the level argument itself if it is ever needed in the future. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/279ae3717cb3d03c0ffeb511493c93c450a01e1a.1591108731.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 38 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed.
Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), diff --git a/net/core/filter.c b/net/core/filter.c index 278dcc0af961..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2015,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? 
+ skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -6280,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6613,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. 
*/ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), -- cgit v1.2.3 From 049fa17f7ae6b0971ad41b761479962facafea4b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 2 Jun 2020 11:46:40 +0700 Subject: Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv" This reverts commit de058420767df21e2b6b0f3bb36d1616fb962032. There is no actual tipc_node refcnt leak as stated in the above commit. The refcnt is held carefully for the case of an asynchronous decryption (i.e. -EINPROGRESS/-EBUSY is returned and skb is set to NULL), so that the node object cannot be freed in the meantime. The counter will be re-balanced when the operation's callback arrives with the decrypted buffer, if any. In other cases, e.g. a synchronous crypto operation, the counter is decreased immediately when the operation is done. With that commit, a kernel panic will occur in tipc_rcv() when no node is found (i.e. n = NULL), or the node object may be released prematurely. This commit solves the issues by reverting the said commit, while keeping the one valid case where skb_linearize() fails. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Tested-by: Hoang Le Signed-off-by: David S. Miller --- net/tipc/node.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 0312fb181d94..a4c2816c3746 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2038,7 +2038,6 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) n = tipc_node_find_by_id(net, ehdr->id); } tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); - tipc_node_put(n); if (!skb) return; -- cgit v1.2.3 From a275727b1899d14d33d6c8c70f303b73f01cdbc6 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 2 Jun 2020 11:46:41 +0700 Subject: Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv" This reverts commit 441870ee4240cf67b5d3ab8e16216a9ff42eb5d6. Like the previous patch in this series, we revert the above commit, which causes similar issues with the 'aead' object. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/crypto.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 8c47ded2edb6..c8c47fc72653 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1712,7 +1712,6 @@ exit: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; - tipc_aead_put(aead); return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); -- cgit v1.2.3
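To make the ownership rule behind these two reverts concrete, a rough sketch (illustrative only, not the actual tipc code):

    /* Receive side, as in tipc_rcv(): the lookup takes a node ref. */
    n = tipc_node_find_by_id(net, ehdr->id);
    tipc_crypto_rcv(net, n ? n->crypto_rx : NULL, &skb, b);
    /* From here tipc_crypto_rcv() owns that ref: a synchronous result
     * drops it internally when done, while -EINPROGRESS/-EBUSY (skb set
     * to NULL) keeps it alive until the decryption callback re-balances
     * it. An extra tipc_node_put(n) here would therefore be a premature
     * release, and a NULL node would make it crash outright.
     */
    if (!skb)
            return;

The reverted commits added exactly such extra put calls, on the caller side for the node and on the -EINPROGRESS/-EBUSY path for the aead, which is why they are backed out here.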